|
|
|
""" |
|
Run Segmentation: K-means clustering, elbow method, silhouette score, PCA visualization |
|
""" |
|
import os |
|
import sys |
|
import glob |
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src')) |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.cluster import KMeans |
|
from sklearn.metrics import silhouette_score |
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
def find_latest_data():
    """Locate the newest processed FRED CSV under data/processed/.

    Returns:
        str: path of the most recently created ``fred_data_*.csv`` file.

    Raises:
        FileNotFoundError: if no matching file exists (pipeline not run yet).
    """
    candidates = glob.glob('data/processed/fred_data_*.csv')
    if candidates:
        # "Latest" = most recent creation/inode-change time, not lexical order.
        return max(candidates, key=os.path.getctime)
    raise FileNotFoundError("No FRED data files found. Run the pipeline first.")
|
|
|
def main():
    """Run the full segmentation workflow on the latest processed FRED data.

    Steps:
      1. Load the most recent ``data/processed/fred_data_*.csv``.
      2. Drop rows with missing values and standardize the features.
      3. Sweep K-means over candidate k, recording inertia (elbow method)
         and silhouette scores; save the diagnostic plot.
      4. Refit K-means at the silhouette-optimal k, print per-cluster
         mean/std statistics, and save a 2-D PCA scatter of the clusters.

    Outputs are written to ``data/exports/`` (created if missing).
    Returns None; progress is printed to stdout.
    """
    print("="*60)
    print("FRED Segmentation: K-means, Elbow, Silhouette, PCA Visualization")
    print("="*60)
    data_file = find_latest_data()
    print(f"Using data file: {data_file}")
    df = pd.read_csv(data_file, index_col=0, parse_dates=True)

    # K-means cannot handle NaNs, so cluster only on complete rows.
    df_clean = df.dropna()
    if df_clean.shape[0] < 10 or df_clean.shape[1] < 2:
        print("Not enough data for clustering (need at least 10 rows and 2 columns after dropna). Skipping.")
        return

    # Ensure the output directory exists before any plt.savefig call;
    # savefig raises FileNotFoundError for a missing directory.
    os.makedirs('data/exports', exist_ok=True)

    # Standardize features: K-means is distance-based and would otherwise be
    # dominated by the series with the largest raw magnitudes.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_clean)

    inertias = []
    silhouette_scores = []
    # Candidate k values: 2 .. min(10, n_rows//10). The max(3, ...) clamp
    # guarantees the range is never empty — the previous bound
    # len(df_clean)//10 + 1 yielded range(2, 2) for 10-19 rows (which pass
    # the size guard above), making np.argmax below crash on an empty list.
    k_upper = min(11, max(3, len(df_clean) // 10 + 1))
    k_range = range(2, k_upper)
    for k in k_range:
        # n_init=10 pins the pre-1.4 scikit-learn default so results stay
        # reproducible across library versions (newer default is 'auto').
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(scaled_data)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_))

    # Side-by-side diagnostics: elbow curve (left), silhouette curve (right).
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    plt.plot(list(k_range), inertias, 'bo-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.grid(True)
    plt.subplot(1,2,2)
    plt.plot(list(k_range), silhouette_scores, 'ro-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('data/exports/clustering_analysis.png', dpi=200)
    plt.close()

    # Select the k with the highest silhouette score.
    optimal_k = list(k_range)[int(np.argmax(silhouette_scores))]
    print(f"Optimal number of clusters: {optimal_k}")
    print(f"Best silhouette score: {max(silhouette_scores):.3f}")

    # Final fit at the chosen k; label each observation with its cluster.
    kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    cluster_labels = kmeans_optimal.fit_predict(scaled_data)
    df_clustered = df_clean.copy()
    df_clustered['Cluster'] = cluster_labels

    # Per-cluster profile in the ORIGINAL (unscaled) units for readability.
    cluster_stats = df_clustered.groupby('Cluster').agg(['mean', 'std'])
    print("\nCluster Characteristics:")
    print(cluster_stats.round(3))

    # Project to 2-D with PCA purely for visualizing cluster separation.
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
    plt.figure(figsize=(8,6))
    scatter = plt.scatter(pca_result[:,0], pca_result[:,1], c=cluster_labels, cmap='tab10', alpha=0.7)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Clusters Visualized with PCA')
    plt.legend(*scatter.legend_elements(), title="Cluster")
    plt.tight_layout()
    plt.savefig('data/exports/clusters_pca.png', dpi=200)
    plt.close()
    print("\nSegmentation complete. Outputs saved to data/exports/.")
|
|
|
if __name__ == "__main__": |
|
main() |