# 1094_871_252_511 / 1094_871_252_511_.py
# antitheft159's picture
# Update 1094_871_252_511_.py
# 5223bd0 verified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import plotly.io as pio
# Select the Plotly renderer used by fig.show(). Only 'iframe_connected' is
# set: the script previously assigned 'notebook' first and overwrote it on the
# very next line, so that value never applied to any figure.
pio.renderers.default = 'iframe_connected'
import pandas as pd
# Load the monthly dataset from a fixed absolute path (a "/content/..." path,
# presumably a Colab upload -- adjust when running elsewhere).
df = pd.read_csv("/content/unified_monthly_data_interpolated_1990_20250101 (1).csv")

# Quick sanity report. The summaries are printed explicitly because a bare
# expression such as `df.shape` displays nothing when this file is executed
# as a script rather than as notebook cells.
df.info()
print(f"Shape: {df.shape}")
print("Missing values per column:")
print(df.isna().sum())
print(f"Duplicated rows: {df.duplicated().sum()}")

# Parse 'Date' and make it the index so the rows are addressed by timestamp.
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# 'Region' is dropped before the correlation and scaling steps, which operate
# on every remaining column (assumes it is the only non-numeric column --
# TODO confirm against the CSV).
df.drop(columns=['Region'], inplace=True)
import matplotlib.pyplot as plt
import seaborn as sns
# Draw an annotated heatmap of the pairwise correlations between all columns
# currently held in df.
plt.figure(figsize=(16, 12))
corr = df.corr()
heatmap_style = {'annot': True, 'fmt': '.2f', 'cmap': 'Reds', 'linewidths': 0.5}
heatmap_ax = sns.heatmap(corr, **heatmap_style)
heatmap_ax.set_title('Correlation Heatmap')
plt.tight_layout()
plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
# Standardise every column of df, then project the scaled data onto two
# principal components.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Transposing gives one row per component, so the two score vectors can be
# unpacked straight into their columns.
df['PCA1'], df['PCA2'] = pca_result.T

# Cluster the observations in the 2-D PCA space into three groups.
# NOTE(review): n_init is left at the library default, which has differed
# between scikit-learn releases -- pin it if exact reproducibility matters.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(pca_result)
df['Cluster'] = kmeans.labels_

# Cluster ids are passed as strings (presumably so Plotly colours them as
# discrete categories rather than on a continuous scale).
cluster_as_text = df['Cluster'].astype(str)
fig = px.scatter(
    df,
    x='PCA1',
    y='PCA2',
    color=cluster_as_text,
    title='KMeans Clustering on PCA Features',
)
fig.show()
# Overlay the three price series on one figure; each column is plotted
# against df's index.
price_columns = (
    'AverageSalesPrice',
    'MedianSalesPriceofHousesSold',
    'MedianListingPriceperSquareFeet',
)

plt.figure(figsize=(14, 6))
for col in price_columns:
    sns.lineplot(data=df[col], label=col)

trend_ax = plt.gca()
trend_ax.set_title('House Price Trends Over Time')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Price')
trend_ax.legend()
plt.tight_layout()
plt.show()
# Fit a random-forest regressor that predicts the median listing price per
# square foot from the remaining columns, report hold-out metrics, and plot
# the feature importances.

# The derived PCA/cluster columns are excluded from the predictors along with
# the target itself.
X = df.drop(columns=['MedianListingPriceperSquareFeet', 'Cluster', 'PCA1', 'PCA2'])
y = df['MedianListingPriceperSquareFeet']

# NOTE(review): this is a random 80/20 split of rows indexed by date; if the
# goal is forecasting, a chronological split may be more appropriate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)
print(f"Mean Squared Error: {mse:.3f}")
print(f"R2 Score: {r2:.3f}")

# Rank the predictors by the importance the fitted forest assigns to them.
importances = model.feature_importances_
feat_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
# The palette is applied through an explicit hue mapping on the same column as
# y: seaborn 0.13 deprecates passing `palette` without `hue`. dodge=False keeps
# full-width bars on older releases, where a hue variable would otherwise
# split each row into thin dodged bars.
importance_ax = sns.barplot(
    data=feat_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    dodge=False,
    palette='viridis',
)
# Some seaborn versions add a legend for the hue variable; here it would only
# repeat the y-axis tick labels, so remove it when present.
if importance_ax.get_legend() is not None:
    importance_ax.get_legend().remove()
plt.title('Feature Importance')
plt.tight_layout()
plt.show()