Spaces:
Sleeping
Sleeping
akhfzl
committed on
Commit
·
52b9f60
0
Parent(s):
'add-to-files'
Browse files- .gitignore +1 -0
- .gradio/flagged/dataset1.csv +2 -0
- app.py +21 -0
- modeling/notebook-legoas-test.ipynb +0 -0
- modeling/results/eop_norm.pkl +0 -0
- modeling/results/pemrosesan-norm.csv +0 -0
- modeling/results/pemrosesan.csv +0 -0
- modeling/results/price_norm.pkl +0 -0
- requirements.txt +89 -0
- scrapping/main.py +5 -0
- scrapping/results.csv +0 -0
- scrapping/utils.py +85 -0
- utils.py +81 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
myenv
|
.gradio/flagged/dataset1.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Nama Mobil (<tahun> <merk> <model>),Harga Mobil (Rp.),output,timestamp
|
2 |
+
2022 Mitsubishi Expander Ultimate CVT,Rp. 331.950.000,Nama mobil harus berformat: <tahun> Mistubishi Pajero,2025-05-07 23:23:43.217620
|
app.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Gradio front-end for the used-car resale price predictor.

Collects three free-text fields and forwards them positionally to
``CarPricePrediction(car_names, car_prices, car_sold)`` from ``utils``.
"""
import gradio as gr

from utils import CarPricePrediction

demo = gr.Interface(
    fn=CarPricePrediction,
    inputs=[
        gr.Textbox(label="Nama Mobil (<tahun> <merk> <model>)", placeholder="2020 Honda Brio"),
        gr.Textbox(label="Harga Mobil (Rp.)", placeholder="Rp. 400.000.000 or Rp 400.000.000"),
        gr.Textbox(label="Tahun Prediksi Penjualan", placeholder="Di tahun berapa mobil akan dijual"),
    ],
    outputs="text",
    title="Prediksi Harga Mobil Bekas",
    description=(
        "Masukkan nama dan harga mobil.\n\n"
        "- Format **Nama Mobil**: '2024 Mitsubishi Pajero'\n"
        "- Format **Harga**: 'Rp. 400.000.000 atau Rp 400.000'\n"
    ),
)

if __name__ == "__main__":
    demo.launch()
modeling/notebook-legoas-test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
modeling/results/eop_norm.pkl
ADDED
Binary file (1.04 kB). View file
|
|
modeling/results/pemrosesan-norm.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
modeling/results/pemrosesan.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
modeling/results/price_norm.pkl
ADDED
Binary file (1.02 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==24.1.0
|
2 |
+
annotated-types==0.7.0
|
3 |
+
anyio==4.9.0
|
4 |
+
attrs==25.3.0
|
5 |
+
audioop-lts==0.2.1
|
6 |
+
bleach==6.2.0
|
7 |
+
certifi==2025.4.26
|
8 |
+
cffi==1.17.1
|
9 |
+
charset-normalizer==3.4.2
|
10 |
+
click==8.1.8
|
11 |
+
colorama==0.4.6
|
12 |
+
fastapi==0.115.12
|
13 |
+
ffmpy==0.5.0
|
14 |
+
filelock==3.18.0
|
15 |
+
fsspec==2025.3.2
|
16 |
+
gradio==5.29.0
|
17 |
+
gradio_client==1.10.0
|
18 |
+
groovy==0.1.2
|
19 |
+
h11==0.16.0
|
20 |
+
httpcore==1.0.9
|
21 |
+
httpx==0.28.1
|
22 |
+
huggingface-hub==0.31.0
|
23 |
+
idna==3.10
|
24 |
+
Jinja2==3.1.6
|
25 |
+
joblib==1.5.0
|
26 |
+
kaggle==1.7.4.2
|
27 |
+
markdown-it-py==3.0.0
|
28 |
+
MarkupSafe==3.0.2
|
29 |
+
mdurl==0.1.2
|
30 |
+
mpmath==1.3.0
|
31 |
+
networkx==3.4.2
|
32 |
+
numpy==2.2.5
|
33 |
+
orjson==3.10.18
|
34 |
+
outcome==1.3.0.post0
|
35 |
+
packaging==25.0
|
36 |
+
pandas==2.2.3
|
37 |
+
pillow==11.2.1
|
38 |
+
protobuf==6.30.2
|
39 |
+
pycparser==2.22
|
40 |
+
pydantic==2.11.4
|
41 |
+
pydantic_core==2.33.2
|
42 |
+
pydub==0.25.1
|
43 |
+
Pygments==2.19.1
|
44 |
+
PySocks==1.7.1
|
45 |
+
python-dateutil==2.9.0.post0
|
46 |
+
python-dotenv==1.1.0
|
47 |
+
python-multipart==0.0.20
|
48 |
+
python-slugify==8.0.4
|
49 |
+
pytz==2025.2
|
50 |
+
PyYAML==6.0.2
|
51 |
+
regex==2024.11.6
|
52 |
+
requests==2.32.3
|
53 |
+
rich==14.0.0
|
54 |
+
ruff==0.11.8
|
55 |
+
safehttpx==0.1.6
|
56 |
+
safetensors==0.5.3
|
57 |
+
scikit-learn==1.6.1
|
58 |
+
scipy==1.15.2
|
59 |
+
selenium==4.32.0
|
60 |
+
semantic-version==2.10.0
|
61 |
+
sentence-transformers==4.1.0
|
62 |
+
setuptools==80.3.1
|
63 |
+
shellingham==1.5.4
|
64 |
+
six==1.17.0
|
65 |
+
sniffio==1.3.1
|
66 |
+
sortedcontainers==2.4.0
|
67 |
+
starlette==0.46.2
|
68 |
+
sympy==1.14.0
|
69 |
+
text-unidecode==1.3
|
70 |
+
threadpoolctl==3.6.0
|
71 |
+
tokenizers==0.21.1
|
72 |
+
tomlkit==0.13.2
|
73 |
+
torch==2.7.0
|
74 |
+
tqdm==4.67.1
|
75 |
+
transformers==4.51.3
|
76 |
+
trio==0.30.0
|
77 |
+
trio-websocket==0.12.2
|
78 |
+
typer==0.15.3
|
79 |
+
typing-inspection==0.4.0
|
80 |
+
typing_extensions==4.13.2
|
81 |
+
tzdata==2025.2
|
82 |
+
urllib3==2.4.0
|
83 |
+
uvicorn==0.34.2
|
84 |
+
webdriver-manager==4.0.2
|
85 |
+
webencodings==0.5.1
|
86 |
+
websocket-client==1.8.0
|
87 |
+
websockets==15.0.1
|
88 |
+
wheel==0.45.1
|
89 |
+
wsproto==1.2.0
|
scrapping/main.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils import ScrapingCarmudi
|
2 |
+
|
3 |
+
# if __name__ == '__main__':
|
4 |
+
# scrap_object = ScrapingCarmudi('https://www.carmudi.co.id/', 495)
|
5 |
+
# print(scrap_object.lets_scraping_toweb())
|
scrapping/results.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
scrapping/utils.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from selenium.webdriver.chrome.service import Service
|
3 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
4 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
5 |
+
from selenium.webdriver.support import expected_conditions as EC
|
6 |
+
from selenium.webdriver.common.by import By
|
7 |
+
from selenium.webdriver.chrome.options import Options
|
8 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
9 |
+
import pandas as pd
|
10 |
+
import time
|
11 |
+
|
12 |
+
class ScrapingCarmudi:
    """Scrape car names and prices from Carmudi listing pages with Selenium.

    Walks up to ``total_pages`` result pages, collecting one row per listing,
    and persists everything to ``results.csv`` — even when scraping aborts
    part-way through (the write happens in a ``finally`` block).
    """

    def __init__(self, link, total_pages=20):
        self.link = link                  # landing-page URL to start from
        self.total_pages = total_pages    # max number of result pages to visit
        self.driver = None                # set by webdriver_connect()
        self.wait = None                  # shared 20 s explicit wait

    def webdriver_connect(self):
        """Start a Chrome driver (auto-managed binary) and the shared wait."""
        options = Options()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--disable-web-security")

        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        self.wait = WebDriverWait(self.driver, 20)

    def lets_scraping_toweb(self):
        """Click the search button, then scrape each result page in turn.

        Writes the collected rows to ``results.csv``; returns None.
        """
        self.webdriver_connect()
        self.driver.get(self.link)

        cari_button = self.wait.until(EC.element_to_be_clickable((By.XPATH, "//div[contains(@class, 'search-button')]/button")))
        cari_button.click()

        time.sleep(3)  # let the first result page settle before querying it

        df = pd.DataFrame({})
        index = 0

        try:
            while index < self.total_pages:
                print(f'Scraping page {index + 1}...')

                # One wait per locator is enough — the original waited on
                # "js-ellipsize-text" twice in a row for no effect.
                car_names = self.wait.until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "js-ellipsize-text"))
                )
                car_prices = self.wait.until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "listing__price"))
                )

                for name, price in zip(car_names, car_prices):
                    car_name = name.text.strip()
                    car_price = price.text.strip()
                    print(car_name, car_price)

                    temp_df = pd.DataFrame({'car_names': [car_name], 'car_prices': [car_price]})
                    df = pd.concat([df, temp_df], ignore_index=True)

                # Dismiss the promo modal if one appeared; absence is normal.
                try:
                    close_btn = WebDriverWait(self.driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, ".modal__destroy.b-close.close--menu"))
                    )
                    self.driver.execute_script("arguments[0].click();", close_btn)
                    print("Popup modal ditutup.")
                except TimeoutException:
                    pass

                # Advance to the next page; stop cleanly on the last one.
                try:
                    next_button = self.wait.until(
                        EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'next')]/a"))
                    )
                    self.driver.execute_script("arguments[0].click();", next_button)
                    print("Klik tombol Selanjutnya.")
                except (TimeoutException, NoSuchElementException):
                    print("Tombol 'Selanjutnya' tidak ditemukan, berhenti di halaman ini.")
                    break

                time.sleep(3)
                index += 1
        finally:
            # Persist whatever was collected. The original also wrote the CSV
            # inside ``try`` — redundant, since ``finally`` runs on the
            # success path too.
            df.to_csv('results.csv', index=False)
            if self.driver is not None:  # guard against a failed connect
                self.driver.quit()
utils.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re, os
|
2 |
+
import pandas as pd
|
3 |
+
import joblib
|
4 |
+
import numpy as np
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
7 |
+
|
8 |
+
# --- Kaggle credentials ----------------------------------------------------
# Materialise the KAGGLE_JSON secret (raw contents of kaggle.json) at
# ~/.kaggle/kaggle.json so KaggleApi can authenticate. ``expanduser`` keeps
# the original /root/.kaggle behaviour when running as root (the usual
# container user) while also working for non-root users.
kaggle_secret = os.getenv("KAGGLE_JSON")
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
if kaggle_secret:  # original crashed with TypeError when the secret was unset
    kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")
    with open(kaggle_json_path, "w") as f:
        f.write(kaggle_secret)
    os.chmod(kaggle_json_path, 0o600)  # Kaggle rejects world-readable creds

api = KaggleApi()
api.authenticate()

# Download the trained regression model once; skip when already cached.
MODEL_PATH = "model.pkl"
if not os.path.exists(MODEL_PATH):
    api.dataset_download_file("myfaizal/model", file_name="model.pkl", path=".", unzip=True)


# Sentence-embedding model used to vectorise cleaned car names.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
23 |
+
|
24 |
+
def CarPricePrediction(car_names, car_prices, car_sold):
    """Predict the resale price of a car and its depreciation percentage.

    Parameters
    ----------
    car_names : str
        "<year> <brand> <model>", e.g. "2024 Mitsubishi Pajero".
    car_prices : str
        Purchase price in rupiah, e.g. "Rp. 400.000.000" or "Rp 400.000.000".
    car_sold : str | int
        Year the car will be sold; must not precede the purchase year.

    Returns
    -------
    str
        A formatted prediction summary, or a validation message when the
        input is malformed.
    """
    # "Rp. 1.234.567" / "Rp 1.234.567" — dot-grouped thousands required.
    pattern = re.compile(r"^Rp\.?\s?\d{1,3}(\.\d{3})+$")
    # A year 1980–2029 followed by at least two words (brand + model).
    # (The original added a redundant "|2025" alternative — 2025 already
    # matches 20[0-2][0-9].)
    name_pattern = re.compile(r"^(19[8-9][0-9]|20[0-2][0-9])\s+\w+(\s+\w+)+$", re.IGNORECASE)

    names = car_names.strip()
    prices = car_prices.strip()
    try:
        car_sold = int(car_sold)
    except (TypeError, ValueError):
        # Keep input errors as friendly messages, like the other checks —
        # the original raised an unhandled ValueError here.
        return 'Tahun penjualan harus berupa angka, misal: 2026'

    if not pattern.match(prices):
        return 'Harga harus berformat rupiah misal: Rp. 2.000.000 atau Rp 2.000.000'

    if not name_pattern.match(names):
        # Fixed the "Mistubishi" typo of the original message.
        return 'Nama mobil harus berformat: <tahun> Mitsubishi Pajero'

    # Validation passed — only now load the (comparatively expensive) model
    # artefacts; the original loaded them before validating anything.
    p_scaler = joblib.load('modeling/results/price_norm.pkl')
    eop_scaler = joblib.load('modeling/results/eop_norm.pkl')
    model = joblib.load(MODEL_PATH)

    test = {
        'car_names': [names],
        'car_prices': [prices]
    }

    test_df = pd.DataFrame(test)

    # feature engineering
    test_df['year'] = test_df['car_names'].str.extract(r'(\b\d{4}\b)').astype(int)

    if car_sold < test_df['year'].tolist()[0]:
        return "Tidak bisa tahun penjualan kurang dari tahun pembelian"

    # Drop the leading year and any " - " suffix to get the clean model name.
    test_df['car_names_clean'] = test_df['car_names'].str.replace(r'^\d{4}\s+', '', regex=True).str.split(r' - ').str[0].str.strip()
    # Strip every non-digit to recover the numeric purchase price.
    test_df['estimated_original_price'] = test_df['car_prices'].str.replace(r'\D', '', regex=True).apply(lambda x: int(x) if x else None)
    test_df['ages'] = test_df['year'].apply(lambda x: car_sold - int(x))

    test_df['eop_norm'] = eop_scaler.transform(test_df[['estimated_original_price']])

    # BERT embedding of the cleaned name + normalised price + age form the
    # model's feature vector.
    car_names_embeddings = bert_model.encode(test_df['car_names_clean'].tolist(), show_progress_bar=True)
    car_names_df = pd.DataFrame(car_names_embeddings, columns=[f'bert_{i}' for i in range(car_names_embeddings.shape[1])])

    test_input = pd.concat([car_names_df, test_df[['eop_norm', 'ages']].reset_index(drop=True)], axis=1)
    test_result = model.predict(test_input)
    # Predictions come out in normalised space; map back to rupiah.
    test_result = p_scaler.inverse_transform(np.array(test_result).reshape(-1, 1))
    test_result = test_result.flatten()

    # Indonesian-style grouping: comma-grouped, then commas -> dots.
    test_result_str = [f"Rp. {format(round(n), ',').replace(',', '.')}" for n in test_result]

    test_df['deprecate_percentage'] = ((test_df['estimated_original_price'] - round(test_result[0])) / test_df['estimated_original_price'])

    return f"""
    Informasi penjualan:
    - {car_names}
    - Harga pembelian: {car_prices}
    - Harga penjualan: {test_result_str[0]}
    - Tahun penjualan: {car_sold}
    - Depresiasi sebesar: {round(test_df['deprecate_percentage'].tolist()[0]*100)}%
    """
|