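# NOTE: the six lines below are residue from the hosting site's file viewer,
# kept as comments so the module remains valid Python.
# akhfzl
# 'add-to-files'
# 52b9f60
# raw
# history blame
# 3.4 kB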
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
class ScrapingCarmudi:
    """Scrape car names and prices from a paginated listing page with Selenium.

    The scraper opens ``link`` in Chrome, clicks the page's search button,
    then walks up to ``total_pages`` result pages, collecting the text of the
    elements with class ``js-ellipsize-text`` (names) and ``listing__price``
    (prices). Everything collected is written to ``results.csv`` in the
    current working directory.

    Attributes:
        link: URL opened at the start of a scraping run.
        total_pages: Maximum number of result pages to visit.
        driver: The Chrome WebDriver, or ``None`` until ``webdriver_connect``
            has been called.
        wait: A 20-second ``WebDriverWait`` bound to ``driver``, or ``None``
            until ``webdriver_connect`` has been called.
    """

    # File the scraped rows are written to, and the CSV column layout.
    _OUTPUT_CSV = 'results.csv'
    _COLUMNS = ('car_names', 'car_prices')

    def __init__(self, link: str, total_pages: int = 20) -> None:
        """Store the scraping parameters; no browser is started here."""
        self.link = link
        self.total_pages = total_pages
        self.driver = None
        self.wait = None

    def webdriver_connect(self) -> None:
        """Start a Chrome session and create the default 20-second wait.

        The driver binary is resolved through ``ChromeDriverManager``.
        """
        options = Options()
        # NOTE(review): both flags relax browser security checks; only point
        # this scraper at sites you trust.
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--disable-web-security")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )
        self.wait = WebDriverWait(self.driver, 20)

    def lets_scraping_toweb(self) -> None:
        """Run the full scrape and write the results to ``results.csv``.

        The browser is always closed once it has been started, even when
        loading the page, clicking the search button, scraping, or writing
        the CSV fails. Rows gathered before a failure inside the page loop
        are still written to the CSV before the exception propagates.

        Raises:
            selenium.common.exceptions.TimeoutException: If the search button
                or the listing elements do not appear within 20 seconds.
        """
        self.webdriver_connect()
        rows = []
        try:
            self.driver.get(self.link)
            search_button = self.wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//div[contains(@class, 'search-button')]/button")
                )
            )
            search_button.click()
            # Fixed pause to let the result list load after the click.
            time.sleep(3)

            try:
                self._scrape_pages(rows)
            finally:
                # Persist whatever was collected, including partial results
                # when the page loop was interrupted by an exception.
                self._rows_to_frame(rows).to_csv(self._OUTPUT_CSV, index=False)
        finally:
            # Outermost cleanup so the browser never outlives this call.
            self.driver.quit()

    def _scrape_pages(self, rows: list) -> None:
        """Visit up to ``total_pages`` pages, appending (name, price) to rows."""
        for page in range(self.total_pages):
            print(f'Scraping page {page + 1}...')
            self._collect_listings(rows)
            self._dismiss_popup()
            if not self._go_to_next_page():
                break
            # Fixed pause to let the next page render before reading it.
            time.sleep(3)

    def _collect_listings(self, rows: list) -> None:
        """Append one (name, price) tuple per listing on the current page."""
        name_elements = self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "js-ellipsize-text"))
        )
        price_elements = self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "listing__price"))
        )
        # zip pairs elements by position and stops at the shorter list.
        # NOTE(review): assumes names and prices appear in the same order and
        # count on the page -- confirm against the site's markup.
        for name_element, price_element in zip(name_elements, price_elements):
            car_name = name_element.text.strip()
            car_price = price_element.text.strip()
            print(car_name, car_price)
            rows.append((car_name, car_price))

    def _dismiss_popup(self) -> None:
        """Close the modal popup if one becomes clickable within 5 seconds."""
        try:
            close_btn = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, ".modal__destroy.b-close.close--menu")
                )
            )
        except TimeoutException:
            # No popup showed up; nothing to close.
            return
        # Click through JavaScript rather than a native WebDriver click.
        self.driver.execute_script("arguments[0].click();", close_btn)
        print("Popup modal ditutup.")

    def _go_to_next_page(self) -> bool:
        """Click the pagination "next" link; return False if it is missing."""
        try:
            next_button = self.wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//li[contains(@class, 'next')]/a")
                )
            )
            self.driver.execute_script("arguments[0].click();", next_button)
        except (TimeoutException, NoSuchElementException):
            print("Tombol 'Selanjutnya' tidak ditemukan, berhenti di halaman ini.")
            return False
        print("Klik tombol Selanjutnya.")
        return True

    @classmethod
    def _rows_to_frame(cls, rows):
        """Build the output DataFrame from (name, price) tuples.

        Building the frame once from a list avoids the quadratic cost of
        concatenating a one-row frame per listing, and an empty result still
        carries the column header.
        """
        return pd.DataFrame(rows, columns=list(cls._COLUMNS))