AI-WebScraper-App / scrape.py
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time


def scrape_website(website):
    print("Connecting to Chrome Browser...")

    # Set up ChromeDriver options
    options = Options()
    options.add_argument("--headless")  # Run in headless mode for deployment
    options.add_argument("--no-sandbox")  # Overcome limited resource problems
    options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

    # Initialize the driver without a specified service (assumes ChromeDriver is in PATH)
    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Set the window size
        wd.get(website)
        wd.implicitly_wait(10)

        print("Waiting for CAPTCHA to be solved manually (if present)...")
        # Optional waiting loop for manual CAPTCHA solving; capped so a headless
        # run cannot hang indefinitely when no one can solve the CAPTCHA by hand.
        waited = 0
        while "captcha" in wd.page_source.lower() and waited < 60:
            print("CAPTCHA detected, waiting...")
            time.sleep(5)
            waited += 5

        print("CAPTCHA solved or not present. Scraping page content...")
        html = wd.page_source
        return html
    except WebDriverException as e:
        print(f"WebDriverException occurred: {e}")
        return None  # Return None or an empty string based on your requirement
    finally:
        if wd:
            wd.quit()


def extract_body_content(html_content):
    # Pull only the <body> of the page; return an empty string if there is no content.
    if html_content is None:
        return ""
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    return str(body_content) if body_content else ""


def clean_body_content(body_content):
    # Strip <script> and <style> tags, then collapse the remaining text
    # into clean, newline-separated lines.
    soup = BeautifulSoup(body_content, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content


def split_dom_content(dom_content, max_length=6000):
    # Slice the cleaned text into fixed-size chunks of at most max_length characters.
    return [dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)]
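

# ------------------------------------------------------------------
# Example usage: a minimal sketch showing how the helpers above chain
# together (scrape -> extract body -> clean -> split). This block is an
# illustration, not part of the deployed app, and the URL below is a
# placeholder assumption; substitute the page you actually want to scrape.
# ------------------------------------------------------------------
if __name__ == "__main__":
    url = "https://example.com"  # placeholder URL, assumed for illustration
    raw_html = scrape_website(url)
    body_html = extract_body_content(raw_html)
    text = clean_body_content(body_html)

    # Split the cleaned text into chunks sized for downstream processing.
    for i, chunk in enumerate(split_dom_content(text, max_length=6000), start=1):
        print(f"--- chunk {i} ({len(chunk)} characters) ---")
        print(chunk[:200])  # preview the first 200 characters of each chunk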