|
|
from selenium import webdriver |
|
|
from selenium.common.exceptions import WebDriverException |
|
|
from selenium.webdriver.chrome.service import Service |
|
|
from selenium.webdriver.chrome.options import Options |
|
|
from bs4 import BeautifulSoup |
|
|
import time |
|
|
|
|
|
def scrape_website(website, *, captcha_timeout=60, poll_interval=5):
    """Load *website* in headless Chrome and return the page source.

    After navigation the page source is checked for the substring
    "captcha" (case-insensitive). While it is present the function sleeps
    and re-checks, up to *captcha_timeout* seconds.

    Args:
        website: URL passed to ``WebDriver.get``.
        captcha_timeout: Maximum number of seconds to wait for the
            "captcha" marker to disappear. ``None`` waits indefinitely.
        poll_interval: Seconds to sleep between page-source checks.

    Returns:
        The page source as a string, or ``None`` if a
        ``WebDriverException`` was raised. If the wait times out, the
        current page source is returned as-is (it may still be a CAPTCHA
        page, or merely a page that mentions "captcha").
    """
    print("Connecting to Chrome Browser...")

    options = Options()
    options.add_argument("--headless")
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)
        wd.get(website)
        # NOTE(review): an implicit wait applies to element lookups, and this
        # function performs none; kept to preserve the driver configuration.
        wd.implicitly_wait(10)

        print("Waiting for CAPTCHA to be solved manually (if present)...")

        # A monotonic clock keeps the deadline immune to wall-clock changes.
        deadline = (
            None if captcha_timeout is None
            else time.monotonic() + captcha_timeout
        )

        html = wd.page_source
        while "captcha" in html.lower():
            # Bound the wait: the browser is headless, so without a deadline
            # a page that keeps containing "captcha" would block forever.
            if deadline is not None and time.monotonic() >= deadline:
                print("Timed out waiting for CAPTCHA; returning page as-is.")
                return html
            print("CAPTCHA detected, waiting...")
            time.sleep(poll_interval)
            html = wd.page_source

        print("CAPTCHA solved or not present. Scraping page content...")
        return html

    except WebDriverException as e:
        print(f"WebDriverException occurred: {e}")
        return None

    finally:
        # Always shut the browser down, including on early return or error.
        if wd is not None:
            wd.quit()
|
|
|
|
|
def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as a string.

    Args:
        html_content: HTML markup to parse, or ``None``.

    Returns:
        ``str()`` of the parsed document's ``body`` element, or an empty
        string when *html_content* is ``None`` or no body is found.
    """
    if html_content is not None:
        parsed = BeautifulSoup(html_content, "html.parser")
        body = parsed.body
        if body:
            return str(body)
    # Either there was nothing to parse or the document has no <body>.
    return ""
|
|
|
|
|
def clean_body_content(body_content):
    """Reduce HTML markup to its visible text, one non-empty line per row.

    ``<script>`` and ``<style>`` elements are removed, the remaining text
    is extracted with newline separators, and each line is stripped of
    surrounding whitespace; lines that end up empty are dropped.

    Args:
        body_content: HTML markup to clean. ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        The cleaned text with lines joined by ``"\\n"``.
    """
    # Nothing to parse; also keeps a None input from reaching the parser.
    if not body_content:
        return ""

    soup = BeautifulSoup(body_content, "html.parser")

    # The removed tags are never used again, so destroy them outright
    # rather than detaching and returning them.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
|
|
|
|
|
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive slices of at most *max_length*.

    Args:
        dom_content: Sliceable sequence (typically a string) to split.
        max_length: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices in original order. Every chunk except possibly
        the last has exactly *max_length* items; empty input gives ``[]``.

    Raises:
        ValueError: If *max_length* is zero or negative.
    """
    # A negative step would make range() empty and silently drop all
    # content; reject it up front, consistently with the zero case.
    if max_length <= 0:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]
|
|
|