"""Exploratory analysis and modeling of a monthly housing dataset.

Loads the unified monthly data, inspects and cleans it, visualizes
correlations and price trends, clusters observations via PCA + KMeans,
and fits a RandomForest to predict median listing price per sq ft.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Only one renderer assignment is kept: the original set 'notebook' and then
# immediately overwrote it, so only 'iframe_connected' ever took effect.
pio.renderers.default = 'iframe_connected'
# ---- Load and clean the monthly housing data -------------------------------
# NOTE(review): path is Colab-specific; parameterize before running elsewhere.
df = pd.read_csv("/content/unified_monthly_data_interpolated_1990_20250101 (1).csv")

# Quick data-quality report. The diagnostics are printed explicitly because
# a bare expression such as `df.shape` is silently discarded outside a
# notebook cell.
df.info()
print("Shape:", df.shape)
print("Missing values per column:\n", df.isna().sum())
print("Duplicate rows:", df.duplicated().sum())

# Promote the date column to a proper DatetimeIndex for time-series work.
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# 'Region' is dropped so the frame stays numeric for corr()/scaling below.
# errors='ignore' keeps this cell re-runnable once the column is gone.
df.drop(columns=['Region'], inplace=True, errors='ignore')
# ---- Correlation heatmap ---------------------------------------------------
# (matplotlib/seaborn are already imported at the top of the file; the
# duplicate mid-file imports were removed.)
plt.figure(figsize=(16, 12))
# numeric_only=True guards against any residual non-numeric column, which
# would raise a TypeError from DataFrame.corr() in pandas >= 2.0.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, fmt='.2f', cmap='Reds', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()
# ---- Unsupervised structure: PCA projection + KMeans clustering ------------
# (sklearn/plotly/pandas are already imported at the top of the file; the
# duplicate mid-file imports were removed.)

# Standardize features so each contributes equally to the PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Project onto the first two principal components for 2-D visualization.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

# n_init is pinned explicitly: its default changed from 10 to 'auto' across
# scikit-learn releases, so leaving it implicit makes results (and warnings)
# version-dependent.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(pca_result)

fig = px.scatter(
    df, x='PCA1', y='PCA2', color=df['Cluster'].astype(str),
    title='KMeans Clustering on PCA Features',
)
fig.show()
# ---- House price trends over time ------------------------------------------
# Overlay the three price series against the DatetimeIndex.
price_columns = [
    'AverageSalesPrice',
    'MedianSalesPriceofHousesSold',
    'MedianListingPriceperSquareFeet',
]

plt.figure(figsize=(14, 6))
for price_col in price_columns:
    sns.lineplot(data=df[price_col], label=price_col)
plt.title('House Price Trends Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()
# ---- Supervised model: predict median listing price per square foot --------
# Exclude the target and the derived PCA/cluster columns from the features.
X = df.drop(columns=['MedianListingPriceperSquareFeet', 'Cluster', 'PCA1', 'PCA2'])
y = df['MedianListingPriceperSquareFeet']

# shuffle=False: the rows form a monthly time series, so the split must be
# chronological (train on the past, test on the most recent 20%). The
# original random shuffle leaked future months into the training set,
# inflating the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)
print(f"Mean Squared Error: {mse:.3f}")
print(f"R2 Score: {r2:.3f}")
# ---- Feature importance from the fitted random forest ----------------------
importances = model.feature_importances_
feat_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(10, 6))
# hue='Feature' + legend=False: seaborn deprecates passing `palette`
# without `hue` (warns since 0.13); this keeps the same per-bar coloring.
sns.barplot(data=feat_df, x='Importance', y='Feature',
            hue='Feature', palette='viridis', legend=False)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()