Spaces:
Sleeping
Sleeping
import re | |
import pandas as pd | |
import joblib | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
from huggingface_hub import hf_hub_download | |
from datetime import datetime | |
# Sentence-embedding model used by CarPricePrediction to encode car names.
# Instantiated once at import time so every prediction reuses the same object.
bert_model = SentenceTransformer(
    'all-MiniLM-L6-v2',
)

# Local filesystem path of the regression model file fetched from the
# Hugging Face Hub repository below.
MODEL_PATH = hf_hub_download(
    repo_id="akhfzl/legois-models",
    filename="model.pkl",
)
# Input-format patterns, compiled once instead of on every call.
_PRICE_PATTERN = re.compile(r"^Rp\.?\s?\d{1,3}(\.\d{3})+$")
_NAME_PATTERN = re.compile(
    r"^(19[8-9][0-9]|20[0-2][0-9]|2025)\s+\w+(\s+\w+)+$", re.IGNORECASE
)

# Filled by _load_artifacts() on first use so the pickles are read only once.
_ARTIFACT_CACHE = {}


def _load_artifacts():
    """Return ``(p_scaler, eop_scaler, model)``, loading them on first use."""
    if not _ARTIFACT_CACHE:
        # NOTE(security): joblib.load unpickles arbitrary objects. Only load
        # artifacts that come from a trusted source.
        loaded = {
            'p_scaler': joblib.load('modeling/results/price_norm.pkl'),
            'eop_scaler': joblib.load('modeling/results/eop_norm.pkl'),
            'model': joblib.load(MODEL_PATH),
        }
        # Publish only after all three loaded, so a failed load never leaves
        # a half-filled cache behind.
        _ARTIFACT_CACHE.update(loaded)
    return (
        _ARTIFACT_CACHE['p_scaler'],
        _ARTIFACT_CACHE['eop_scaler'],
        _ARTIFACT_CACHE['model'],
    )


def CarPricePrediction(car_names, car_prices, car_sold):
    """Predict the resale price of a car and describe the result as text.

    All inputs are validated before any model artifact is loaded, so invalid
    requests are rejected cheaply.

    Args:
        car_names: Car description in the form ``"<year> <brand> <model>"``,
            e.g. ``"2020 Mitsubishi Pajero"``. The leading year is used as
            the purchase year.
        car_prices: Purchase price in rupiah notation, e.g.
            ``"Rp. 2.000.000"`` or ``"Rp 2.000.000"``.
        car_sold: Year of sale (anything ``int()`` accepts). A falsy value
            (``None``, ``""``, ``0``) means "the current year".

    Returns:
        A multi-line summary string (in Indonesian) containing the predicted
        selling price and depreciation percentage, or a one-line error
        message string when an input is invalid.
    """
    # Treat None like an empty string so it fails format validation instead
    # of raising AttributeError on .strip().
    names = (car_names or '').strip()
    prices = (car_prices or '').strip()

    try:
        car_sold = int(car_sold) if car_sold else datetime.now().year
    except (TypeError, ValueError, OverflowError):
        # Report bad input the same way as the other validation failures.
        return 'Tahun penjualan harus berupa angka, misal: 2024'

    if not _PRICE_PATTERN.match(prices):
        return 'Harga harus berformat rupiah misal: Rp. 2.000.000 atau Rp 2.000.000'

    name_match = _NAME_PATTERN.match(names)
    if not name_match:
        return 'Nama mobil harus berformat: <tahun> Mistubishi Pajero'

    # The name pattern anchors the four-digit year at the start of the string.
    year = int(name_match.group(1))
    if car_sold < year:
        return "Tidak bisa tahun penjualan kurang dari tahun pembelian"

    # Keep only the digits: "Rp. 2.000.000" -> 2000000.
    original_price = int(re.sub(r'\D', '', prices))
    if original_price <= 0:
        # A zero price passes the format check but would divide by zero in
        # the depreciation computation below.
        return 'Harga harus lebih besar dari 0'

    p_scaler, eop_scaler, model = _load_artifacts()

    # Feature engineering: drop the leading year and any " - " suffix.
    clean_name = re.sub(r'^\d{4}\s+', '', names).split(' - ')[0].strip()

    # Column names are kept as in the original pipeline; presumably they
    # match the names the scaler and model were fitted with (TODO confirm).
    row = pd.DataFrame({
        'estimated_original_price': [original_price],
        'ages': [car_sold - year],
    })
    row['eop_norm'] = eop_scaler.transform(row[['estimated_original_price']])

    embeddings = bert_model.encode([clean_name], show_progress_bar=True)
    embedding_df = pd.DataFrame(
        embeddings,
        columns=[f'bert_{i}' for i in range(embeddings.shape[1])],
    )
    model_input = pd.concat([embedding_df, row[['eop_norm', 'ages']]], axis=1)

    scaled_prediction = model.predict(model_input)
    predicted = p_scaler.inverse_transform(
        np.array(scaled_prediction).reshape(-1, 1)
    ).flatten()

    # Convert to a Python float first so round() always yields an int,
    # whatever NumPy scalar type the scaler returned.
    predicted_price = round(float(predicted[0]))
    predicted_price_str = f"Rp. {format(predicted_price, ',').replace(',', '.')}"

    depreciation = (original_price - predicted_price) / original_price

    return (
        "\n"
        "Informasi penjualan:\n"
        f"- {car_names}\n"
        f"- Harga pembelian: {car_prices}\n"
        f"- Harga penjualan: {predicted_price_str}\n"
        f"- Tahun penjualan: {car_sold}\n"
        f"- Depresiasi sebesar: {round(depreciation * 100)}%\n"
    )