akhfzl committed
Commit 52b9f60 · 0 Parent(s): 'add-to-files'
.gitignore ADDED
@@ -0,0 +1 @@
+ myenv
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
+ Nama Mobil (<tahun> <merk> <model>),Harga Mobil (Rp.),output,timestamp
+ 2022 Mitsubishi Expander Ultimate CVT,Rp. 331.950.000,Nama mobil harus berformat: <tahun> Mistubishi Pajero,2025-05-07 23:23:43.217620
app.py ADDED
@@ -0,0 +1,21 @@
+ import gradio as gr
+ from utils import CarPricePrediction
+
+ demo = gr.Interface(
+     fn=CarPricePrediction,
+     inputs=[
+         gr.Textbox(label="Nama Mobil (<tahun> <merk> <model>)", placeholder="2020 Honda Brio"),
+         gr.Textbox(label="Harga Mobil (Rp.)", placeholder="Rp. 400.000.000 or Rp 400.000.000"),
+         gr.Textbox(label="Tahun Prediksi Penjualan", placeholder="Di tahun berapa mobil akan dijual")
+     ],
+     outputs="text",
+     title="Prediksi Harga Mobil Bekas",
+     description=(
+         "Masukkan nama dan harga mobil.\n\n"
+         "- Format **Nama Mobil**: '2024 Mitsubishi Pajero'\n"
+         "- Format **Harga**: 'Rp. 400.000.000 atau Rp 400.000.000'\n"
+     )
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
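For local testing, the same interface can also expose a temporary public URL via Gradio's standard launch flag (a usage sketch, not part of this commit):

    demo.launch(share=True)  # prints a shareable *.gradio.live link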
modeling/notebook-legoas-test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/eop_norm.pkl ADDED
Binary file (1.04 kB). View file
 
modeling/results/pemrosesan-norm.csv ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/pemrosesan.csv ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/price_norm.pkl ADDED
Binary file (1.02 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,89 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ attrs==25.3.0
+ audioop-lts==0.2.1
+ bleach==6.2.0
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.1.8
+ colorama==0.4.6
+ fastapi==0.115.12
+ ffmpy==0.5.0
+ filelock==3.18.0
+ fsspec==2025.3.2
+ gradio==5.29.0
+ gradio_client==1.10.0
+ groovy==0.1.2
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.31.0
+ idna==3.10
+ Jinja2==3.1.6
+ joblib==1.5.0
+ kaggle==1.7.4.2
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.4.2
+ numpy==2.2.5
+ orjson==3.10.18
+ outcome==1.3.0.post0
+ packaging==25.0
+ pandas==2.2.3
+ pillow==11.2.1
+ protobuf==6.30.2
+ pycparser==2.22
+ pydantic==2.11.4
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ PySocks==1.7.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ python-slugify==8.0.4
+ pytz==2025.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.8
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ selenium==4.32.0
+ semantic-version==2.10.0
+ sentence-transformers==4.1.0
+ setuptools==80.3.1
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ sortedcontainers==2.4.0
+ starlette==0.46.2
+ sympy==1.14.0
+ text-unidecode==1.3
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.7.0
+ tqdm==4.67.1
+ transformers==4.51.3
+ trio==0.30.0
+ trio-websocket==0.12.2
+ typer==0.15.3
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.2
+ webdriver-manager==4.0.2
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ wheel==0.45.1
+ wsproto==1.2.0
scrapping/main.py ADDED
@@ -0,0 +1,5 @@
+ from utils import ScrapingCarmudi
+
+ # if __name__ == '__main__':
+ #     scrap_object = ScrapingCarmudi('https://www.carmudi.co.id/', 495)
+ #     print(scrap_object.lets_scraping_toweb())
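For a quick smoke test, the commented entry point can be enabled with a smaller page count (hypothetical values; the committed call scrapes 495 pages):

    if __name__ == '__main__':
        scraper = ScrapingCarmudi('https://www.carmudi.co.id/', 2)  # 2 pages only
        scraper.lets_scraping_toweb()  # writes results.csv as a side effect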
scrapping/results.csv ADDED
The diff for this file is too large to render. See raw diff
 
scrapping/utils.py ADDED
@@ -0,0 +1,85 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
+ import pandas as pd
+ import time
+
+ class ScrapingCarmudi:
+     def __init__(self, link, total_pages=20):
+         self.link = link
+         self.total_pages = total_pages
+         self.driver = None
+         self.wait = None
+
+     def webdriver_connect(self):
+         options = Options()
+         options.add_argument('--ignore-certificate-errors')
+         options.add_argument("--disable-web-security")
+
+         self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+         self.wait = WebDriverWait(self.driver, 20)
+
+     def lets_scraping_toweb(self):
+         self.webdriver_connect()
+         self.driver.get(self.link)
+
+         # Click the search button to load the first listing page.
+         cari_button = self.wait.until(EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'search-button')]/button")))
+         cari_button.click()
+
+         time.sleep(3)
+
+         df = pd.DataFrame({})
+         index = 0
+
+         try:
+             while index < self.total_pages:
+                 print(f'Scraping page {index + 1}...')
+
+                 car_names = self.wait.until(
+                     EC.presence_of_all_elements_located((By.CLASS_NAME, "js-ellipsize-text"))
+                 )
+                 car_prices = self.wait.until(
+                     EC.presence_of_all_elements_located((By.CLASS_NAME, "listing__price"))
+                 )
+
+                 for name, price in zip(car_names, car_prices):
+                     car_name = name.text.strip()
+                     car_price = price.text.strip()
+                     print(car_name, car_price)
+
+                     temp_df = pd.DataFrame({'car_names': [car_name], 'car_prices': [car_price]})
+                     df = pd.concat([df, temp_df], ignore_index=True)
+
+                 # Dismiss the popup modal if it appears.
+                 try:
+                     close_btn = WebDriverWait(self.driver, 5).until(
+                         EC.element_to_be_clickable((By.CSS_SELECTOR, ".modal__destroy.b-close.close--menu"))
+                     )
+                     self.driver.execute_script("arguments[0].click();", close_btn)
+                     print("Popup modal closed.")
+                 except TimeoutException:
+                     pass
+
+                 # Advance to the next page; stop when the next control is missing.
+                 try:
+                     next_button = self.wait.until(
+                         EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]/a"))
+                     )
+                     self.driver.execute_script("arguments[0].click();", next_button)
+                     print("Clicked the Next button.")
+                 except (TimeoutException, NoSuchElementException):
+                     print("Next button not found; stopping at this page.")
+                     break
+
+                 time.sleep(3)
+                 index += 1
+         finally:
+             # Persist whatever was collected, even if scraping failed midway.
+             df.to_csv('results.csv', index=False)
+             self.driver.quit()
utils.py ADDED
@@ -0,0 +1,91 @@
+ import re, os
+ import pandas as pd
+ import joblib
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from kaggle.api.kaggle_api_extended import KaggleApi
+
+ # Write the Kaggle credentials from the KAGGLE_JSON secret so the API can authenticate.
+ kaggle_secret = os.getenv("KAGGLE_JSON")
+ os.makedirs("/root/.kaggle", exist_ok=True)
+ with open("/root/.kaggle/kaggle.json", "w") as f:
+     f.write(kaggle_secret)
+ os.chmod("/root/.kaggle/kaggle.json", 0o600)
+
+ api = KaggleApi()
+ api.authenticate()
+
+ # Download the trained model once; skip if it is already present.
+ MODEL_PATH = "model.pkl"
+ if not os.path.exists(MODEL_PATH):
+     api.dataset_download_file("myfaizal/model", file_name="model.pkl", path=".", unzip=True)
+
+
+ bert_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ def CarPricePrediction(car_names, car_prices, car_sold):
+     # Load the fitted scalers and the trained model.
+     p_scaler = joblib.load('modeling/results/price_norm.pkl')
+     eop_scaler = joblib.load('modeling/results/eop_norm.pkl')
+     model = joblib.load(MODEL_PATH)
+
+     # Accepted formats: "Rp. 2.000.000" or "Rp 2.000.000"; names start with a year (1980-2029).
+     pattern = re.compile(r"^Rp\.?\s?\d{1,3}(\.\d{3})+$")
+     name_pattern = re.compile(r"^(19[8-9][0-9]|20[0-2][0-9])\s+\w+(\s+\w+)+$", re.IGNORECASE)
+
+     names = car_names.strip()
+     prices = car_prices.strip()
+
+     # Validate all inputs before any feature engineering.
+     try:
+         car_sold = int(car_sold)
+     except ValueError:
+         return 'Tahun penjualan harus berupa angka, misal: 2026'
+
+     if not pattern.match(prices):
+         return 'Harga harus berformat rupiah misal: Rp. 2.000.000 atau Rp 2.000.000'
+
+     if not name_pattern.match(names):
+         return 'Nama mobil harus berformat: <tahun> Mitsubishi Pajero'
+
+     test = {
+         'car_names': [names],
+         'car_prices': [prices]
+     }
+
+     test_df = pd.DataFrame(test)
+
+     # Feature engineering: purchase year, cleaned name, numeric price, and car age.
+     test_df['year'] = test_df['car_names'].str.extract(r'(\b\d{4}\b)').astype(int)
+
+     if car_sold < test_df['year'].tolist()[0]:
+         return "Tidak bisa tahun penjualan kurang dari tahun pembelian"
+
+     test_df['car_names_clean'] = test_df['car_names'].str.replace(r'^\d{4}\s+', '', regex=True).str.split(r' - ').str[0].str.strip()
+     test_df['estimated_original_price'] = test_df['car_prices'].str.replace(r'\D', '', regex=True).apply(lambda x: int(x) if x else None)
+     test_df['ages'] = test_df['year'].apply(lambda x: car_sold - int(x))
+
+     test_df['eop_norm'] = eop_scaler.transform(test_df[['estimated_original_price']])
+
+     # Embed the cleaned car name and assemble the model input.
+     car_names_embeddings = bert_model.encode(test_df['car_names_clean'].tolist(), show_progress_bar=True)
+     car_names_df = pd.DataFrame(car_names_embeddings, columns=[f'bert_{i}' for i in range(car_names_embeddings.shape[1])])
+
+     test_input = pd.concat([car_names_df, test_df[['eop_norm', 'ages']].reset_index(drop=True)], axis=1)
+     test_result = model.predict(test_input)
+     # Map the normalised prediction back to rupiah.
+     test_result = p_scaler.inverse_transform(np.array(test_result).reshape(-1, 1))
+     test_result = test_result.flatten()
+
+     test_result_str = [f"Rp. {format(round(n), ',').replace(',', '.')}" for n in test_result]
+
+     test_df['deprecate_percentage'] = ((test_df['estimated_original_price'] - round(test_result[0])) / test_df['estimated_original_price'])
+
+     return f"""
+     Informasi penjualan:
+     - {car_names}
+     - Harga pembelian: {car_prices}
+     - Harga penjualan: {test_result_str[0]}
+     - Tahun penjualan: {car_sold}
+     - Depresiasi sebesar: {round(test_df['deprecate_percentage'].tolist()[0]*100)}%
+     """