Spaces:
Sleeping
Sleeping
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.options import Options | |
from selenium.common.exceptions import TimeoutException, NoSuchElementException | |
import pandas as pd | |
import time | |
class ScrapingCarmudi:
    """Scrape car names and prices from a paginated listing page into a CSV.

    The scraper opens ``link`` in Chrome, clicks the search button, then walks
    through result pages collecting the text of every element with class
    ``js-ellipsize-text`` (name) and ``listing__price`` (price). Results are
    written to ``results.csv`` in the current working directory.

    Args:
        link: URL of the page that hosts the search button.
        total_pages: Maximum number of result pages to visit (default 20).
    """

    def __init__(self, link, total_pages=20):
        self.link = link
        self.total_pages = total_pages
        # Both are populated by webdriver_connect(); None until then.
        self.driver = None
        self.wait = None

    def webdriver_connect(self):
        """Start a Chrome session and create the shared 20-second explicit wait."""
        options = Options()
        options.add_argument('--ignore-certificate-errors')
        # NOTE(review): this switches off the browser's same-origin checks;
        # confirm the target site really needs it.
        options.add_argument("--disable-web-security")
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        self.wait = WebDriverWait(self.driver, 20)

    def lets_scraping_toweb(self):
        """Run the full scrape and write the collected rows to ``results.csv``.

        The browser is always closed, even when navigation or the initial
        search click fails. Once paging has started, whatever rows were
        collected are written to the CSV even if a later page raises.

        Raises:
            TimeoutException: If the search button or the listing elements do
                not appear within the wait timeout.
        """
        self.webdriver_connect()
        try:
            self.driver.get(self.link)
            cari_button = self.wait.until(
                EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'search-button')]/button"))
            )
            cari_button.click()
            time.sleep(3)

            rows = []
            try:
                self._collect_pages(rows)
            finally:
                # Persist partial results even when a page fails midway.
                self._save_rows(rows)
        finally:
            # Outermost so the browser is released on every exit path,
            # including a failure while writing the CSV.
            self.driver.quit()

    def _collect_pages(self, rows):
        """Append rows from up to ``total_pages`` result pages to ``rows`` in place."""
        index = 0
        while index < self.total_pages:
            print(f'Scraping page {index + 1}...')
            rows.extend(self._scrape_current_page())
            self._dismiss_popup()
            if not self._go_to_next_page():
                break
            # Fixed pause to give the next page time to load.
            time.sleep(3)
            index += 1

    def _scrape_current_page(self):
        """Return ``(name, price)`` text pairs for the listings currently shown."""
        car_names = self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "js-ellipsize-text"))
        )
        car_prices = self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "listing__price"))
        )
        page_rows = []
        # zip() pairs by position and stops at the shorter list, so a listing
        # without a price element would shift or drop pairs.
        for name, price in zip(car_names, car_prices):
            car_name = name.text.strip()
            car_price = price.text.strip()
            print(car_name, car_price)
            page_rows.append((car_name, car_price))
        return page_rows

    def _dismiss_popup(self):
        """Close the modal popup if it becomes clickable within 5 seconds."""
        try:
            close_btn = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".modal__destroy.b-close.close--menu"))
            )
        except TimeoutException:
            # No popup appeared; this is the expected common case.
            return
        # Click via JavaScript rather than a native WebElement.click().
        self.driver.execute_script("arguments[0].click();", close_btn)
        print("Popup modal ditutup.")

    def _go_to_next_page(self):
        """Click the 'next' pagination link; return False when it is not available."""
        try:
            next_button = self.wait.until(
                EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]/a"))
            )
            self.driver.execute_script("arguments[0].click();", next_button)
        except (TimeoutException, NoSuchElementException):
            print("Tombol 'Selanjutnya' tidak ditemukan, berhenti di halaman ini.")
            return False
        print("Klik tombol Selanjutnya.")
        return True

    @staticmethod
    def _save_rows(rows):
        """Write ``rows`` to ``results.csv`` with ``car_names``/``car_prices`` columns."""
        # Build the frame once from the accumulated rows instead of
        # concatenating a one-row frame per listing.
        df = pd.DataFrame(rows, columns=['car_names', 'car_prices'])
        df.to_csv('results.csv', index=False)