akhfzl committed
Commit 52b9f60 · 0 Parent(s): 'add-to-files'
.gitignore ADDED
@@ -0,0 +1 @@
+ myenv
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
+ Nama Mobil (<tahun> <merk> <model>),Harga Mobil (Rp.),output,timestamp
+ 2022 Mitsubishi Expander Ultimate CVT,Rp. 331.950.000,Nama mobil harus berformat: <tahun> Mistubishi Pajero,2025-05-07 23:23:43.217620
app.py ADDED
@@ -0,0 +1,21 @@
+ import gradio as gr
+ from utils import CarPricePrediction
+
+ demo = gr.Interface(
+     fn=CarPricePrediction,
+     inputs=[
+         gr.Textbox(label="Nama Mobil (<tahun> <merk> <model>)", placeholder="2020 Honda Brio"),
+         gr.Textbox(label="Harga Mobil (Rp.)", placeholder="Rp. 400.000.000 or Rp 400.000.000"),
+         gr.Textbox(label="Tahun Prediksi Penjualan", placeholder="Di tahun berapa mobil akan dijual")
+     ],
+     outputs="text",
+     title="Prediksi Harga Mobil Bekas",
+     description=(
+         "Masukkan nama dan harga mobil.\n\n"
+         "- Format **Nama Mobil**: '2024 Mitsubishi Pajero'\n"
+         "- Format **Harga**: 'Rp. 400.000.000 atau Rp 400.000.000'\n"
+     )
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
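For local testing, the same interface can also expose a temporary public URL via Gradio's standard launch flag (a usage sketch, not part of this commit):

    demo.launch(share=True)  # prints a shareable *.gradio.live link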
modeling/notebook-legoas-test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/eop_norm.pkl ADDED
Binary file (1.04 kB). View file
 
modeling/results/pemrosesan-norm.csv ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/pemrosesan.csv ADDED
The diff for this file is too large to render. See raw diff
 
modeling/results/price_norm.pkl ADDED
Binary file (1.02 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,89 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ attrs==25.3.0
+ audioop-lts==0.2.1
+ bleach==6.2.0
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.1.8
+ colorama==0.4.6
+ fastapi==0.115.12
+ ffmpy==0.5.0
+ filelock==3.18.0
+ fsspec==2025.3.2
+ gradio==5.29.0
+ gradio_client==1.10.0
+ groovy==0.1.2
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.31.0
+ idna==3.10
+ Jinja2==3.1.6
+ joblib==1.5.0
+ kaggle==1.7.4.2
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.4.2
+ numpy==2.2.5
+ orjson==3.10.18
+ outcome==1.3.0.post0
+ packaging==25.0
+ pandas==2.2.3
+ pillow==11.2.1
+ protobuf==6.30.2
+ pycparser==2.22
+ pydantic==2.11.4
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ PySocks==1.7.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ python-slugify==8.0.4
+ pytz==2025.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.8
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ selenium==4.32.0
+ semantic-version==2.10.0
+ sentence-transformers==4.1.0
+ setuptools==80.3.1
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ sortedcontainers==2.4.0
+ starlette==0.46.2
+ sympy==1.14.0
+ text-unidecode==1.3
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.7.0
+ tqdm==4.67.1
+ transformers==4.51.3
+ trio==0.30.0
+ trio-websocket==0.12.2
+ typer==0.15.3
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.34.2
+ webdriver-manager==4.0.2
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ wheel==0.45.1
+ wsproto==1.2.0
scrapping/main.py ADDED
@@ -0,0 +1,5 @@
+ from utils import ScrapingCarmudi
+
+ # if __name__ == '__main__':
+ #     scrap_object = ScrapingCarmudi('https://www.carmudi.co.id/', 495)
+ #     print(scrap_object.lets_scraping_toweb())
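For a quick smoke test, the commented entry point can be enabled with a smaller page count (hypothetical values; the committed call scrapes 495 pages):

    if __name__ == '__main__':
        scraper = ScrapingCarmudi('https://www.carmudi.co.id/', 2)  # 2 pages only
        scraper.lets_scraping_toweb()  # writes results.csv as a side effect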
scrapping/results.csv ADDED
The diff for this file is too large to render. See raw diff
 
scrapping/utils.py ADDED
@@ -0,0 +1,85 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
+ import pandas as pd
+ import time
+
+ class ScrapingCarmudi:
+     def __init__(self, link, total_pages=20):
+         self.link = link
+         self.total_pages = total_pages
+         self.driver = None
+         self.wait = None
+
+     def webdriver_connect(self):
+         options = Options()
+         options.add_argument('--ignore-certificate-errors')
+         options.add_argument("--disable-web-security")
+
+         self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+         self.wait = WebDriverWait(self.driver, 20)
+
+     def lets_scraping_toweb(self):
+         self.webdriver_connect()
+         self.driver.get(self.link)
+
+         # Click the search button to load the first listing page.
+         cari_button = self.wait.until(EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'search-button')]/button")))
+         cari_button.click()
+
+         time.sleep(3)
+
+         df = pd.DataFrame({})
+         index = 0
+
+         try:
+             while index < self.total_pages:
+                 print(f'Scraping page {index + 1}...')
+
+                 car_names = self.wait.until(
+                     EC.presence_of_all_elements_located((By.CLASS_NAME, "js-ellipsize-text"))
+                 )
+                 car_prices = self.wait.until(
+                     EC.presence_of_all_elements_located((By.CLASS_NAME, "listing__price"))
+                 )
+
+                 for name, price in zip(car_names, car_prices):
+                     car_name = name.text.strip()
+                     car_price = price.text.strip()
+                     print(car_name, car_price)
+
+                     temp_df = pd.DataFrame({'car_names': [car_name], 'car_prices': [car_price]})
+                     df = pd.concat([df, temp_df], ignore_index=True)
+
+                 # Dismiss the popup modal if it appears.
+                 try:
+                     close_btn = WebDriverWait(self.driver, 5).until(
+                         EC.element_to_be_clickable((By.CSS_SELECTOR, ".modal__destroy.b-close.close--menu"))
+                     )
+                     self.driver.execute_script("arguments[0].click();", close_btn)
+                     print("Popup modal closed.")
+                 except TimeoutException:
+                     pass
+
+                 # Advance to the next page; stop when the next control is missing.
+                 try:
+                     next_button = self.wait.until(
+                         EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]/a"))
+                     )
+                     self.driver.execute_script("arguments[0].click();", next_button)
+                     print("Clicked the Next button.")
+                 except (TimeoutException, NoSuchElementException):
+                     print("Next button not found; stopping at this page.")
+                     break
+
+                 time.sleep(3)
+                 index += 1
+         finally:
+             # Persist whatever was collected, even if scraping failed midway.
+             df.to_csv('results.csv', index=False)
+             self.driver.quit()
utils.py ADDED
@@ -0,0 +1,91 @@
+ import re, os
+ import pandas as pd
+ import joblib
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from kaggle.api.kaggle_api_extended import KaggleApi
+
+ # Write the Kaggle credentials from the KAGGLE_JSON secret so the API can authenticate.
+ kaggle_secret = os.getenv("KAGGLE_JSON")
+ os.makedirs("/root/.kaggle", exist_ok=True)
+ with open("/root/.kaggle/kaggle.json", "w") as f:
+     f.write(kaggle_secret)
+ os.chmod("/root/.kaggle/kaggle.json", 0o600)
+
+ api = KaggleApi()
+ api.authenticate()
+
+ # Download the trained model once; skip if it is already present.
+ MODEL_PATH = "model.pkl"
+ if not os.path.exists(MODEL_PATH):
+     api.dataset_download_file("myfaizal/model", file_name="model.pkl", path=".", unzip=True)
+
+
+ bert_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ def CarPricePrediction(car_names, car_prices, car_sold):
+     # Load the fitted scalers and the trained model.
+     p_scaler = joblib.load('modeling/results/price_norm.pkl')
+     eop_scaler = joblib.load('modeling/results/eop_norm.pkl')
+     model = joblib.load(MODEL_PATH)
+
+     # Accepted formats: "Rp. 2.000.000" or "Rp 2.000.000"; names start with a year (1980-2029).
+     pattern = re.compile(r"^Rp\.?\s?\d{1,3}(\.\d{3})+$")
+     name_pattern = re.compile(r"^(19[8-9][0-9]|20[0-2][0-9])\s+\w+(\s+\w+)+$", re.IGNORECASE)
+
+     names = car_names.strip()
+     prices = car_prices.strip()
+
+     # Validate all inputs before any feature engineering.
+     try:
+         car_sold = int(car_sold)
+     except ValueError:
+         return 'Tahun penjualan harus berupa angka, misal: 2026'
+
+     if not pattern.match(prices):
+         return 'Harga harus berformat rupiah misal: Rp. 2.000.000 atau Rp 2.000.000'
+
+     if not name_pattern.match(names):
+         return 'Nama mobil harus berformat: <tahun> Mitsubishi Pajero'
+
+     test = {
+         'car_names': [names],
+         'car_prices': [prices]
+     }
+
+     test_df = pd.DataFrame(test)
+
+     # Feature engineering: purchase year, cleaned name, numeric price, and car age.
+     test_df['year'] = test_df['car_names'].str.extract(r'(\b\d{4}\b)').astype(int)
+
+     if car_sold < test_df['year'].tolist()[0]:
+         return "Tidak bisa tahun penjualan kurang dari tahun pembelian"
+
+     test_df['car_names_clean'] = test_df['car_names'].str.replace(r'^\d{4}\s+', '', regex=True).str.split(r' - ').str[0].str.strip()
+     test_df['estimated_original_price'] = test_df['car_prices'].str.replace(r'\D', '', regex=True).apply(lambda x: int(x) if x else None)
+     test_df['ages'] = test_df['year'].apply(lambda x: car_sold - int(x))
+
+     test_df['eop_norm'] = eop_scaler.transform(test_df[['estimated_original_price']])
+
+     # Embed the cleaned car name and assemble the model input.
+     car_names_embeddings = bert_model.encode(test_df['car_names_clean'].tolist(), show_progress_bar=True)
+     car_names_df = pd.DataFrame(car_names_embeddings, columns=[f'bert_{i}' for i in range(car_names_embeddings.shape[1])])
+
+     test_input = pd.concat([car_names_df, test_df[['eop_norm', 'ages']].reset_index(drop=True)], axis=1)
+     test_result = model.predict(test_input)
+     # Map the normalised prediction back to rupiah.
+     test_result = p_scaler.inverse_transform(np.array(test_result).reshape(-1, 1))
+     test_result = test_result.flatten()
+
+     test_result_str = [f"Rp. {format(round(n), ',').replace(',', '.')}" for n in test_result]
+
+     test_df['deprecate_percentage'] = ((test_df['estimated_original_price'] - round(test_result[0])) / test_df['estimated_original_price'])
+
+     return f"""
+     Informasi penjualan:
+     - {car_names}
+     - Harga pembelian: {car_prices}
+     - Harga penjualan: {test_result_str[0]}
+     - Tahun penjualan: {car_sold}
+     - Depresiasi sebesar: {round(test_df['deprecate_percentage'].tolist()[0]*100)}%
+     """