#!/usr/bin/env python
"""
Run Segmentation: K-means clustering, elbow method, silhouette score, PCA visualization
"""
import os
import sys
import glob
# Make the project's src/ directory importable (not used directly in this script)
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def find_latest_data():
    """Return the path of the most recent FRED export in data/processed/."""
    data_files = glob.glob('data/processed/fred_data_*.csv')
    if not data_files:
        raise FileNotFoundError("No FRED data files found. Run the pipeline first.")
    # Note: getctime is creation time on Windows but inode-change time on Unix;
    # for write-once pipeline outputs either way selects the newest file.
    return max(data_files, key=os.path.getctime)

def main():
    print("="*60)
    print("FRED Segmentation: K-means, Elbow, Silhouette, PCA Visualization")
    print("="*60)
    data_file = find_latest_data()
    print(f"Using data file: {data_file}")
    df = pd.read_csv(data_file, index_col=0, parse_dates=True)
    df_clean = df.dropna()
    if df_clean.shape[0] < 10 or df_clean.shape[1] < 2:
        print("Not enough data for clustering (need at least 10 rows and 2 columns after dropna). Skipping.")
        return
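    # Standardize features before clustering: K-means uses Euclidean distance,
    # so unscaled series (e.g. index levels vs. percentage rates) would dominate.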
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_clean)
    # Elbow method and silhouette analysis over candidate cluster counts:
    # cap k at 10 and keep roughly >= 10 samples per cluster, but always
    # evaluate at least k = 2 so the range is never empty on small datasets.
    inertias = []
    silhouette_scores = []
    k_range = range(2, max(3, min(11, len(df_clean) // 10 + 1)))
    for k in k_range:
        # n_init pinned explicitly for reproducibility across scikit-learn versions
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        kmeans.fit(scaled_data)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_))
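    # Echo the scores so a run can be sanity-checked without opening the plot
    for k, inertia, sil in zip(k_range, inertias, silhouette_scores):
        print(f"k={k}: inertia={inertia:.1f}, silhouette={sil:.3f}")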
    # Plot elbow and silhouette
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    plt.plot(list(k_range), inertias, 'bo-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.grid(True)
    plt.subplot(1,2,2)
    plt.plot(list(k_range), silhouette_scores, 'ro-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis')
    plt.grid(True)
    plt.tight_layout()
    os.makedirs('data/exports', exist_ok=True)  # ensure the output directory exists
    plt.savefig('data/exports/clustering_analysis.png', dpi=200)
    plt.close()
    # Select the k with the highest silhouette score
    optimal_k = list(k_range)[np.argmax(silhouette_scores)]
    print(f"Optimal number of clusters: {optimal_k}")
    print(f"Best silhouette score: {max(silhouette_scores):.3f}")
    # Final clustering with the selected k
    kmeans_optimal = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
    cluster_labels = kmeans_optimal.fit_predict(scaled_data)
    df_clustered = df_clean.copy()
    df_clustered['Cluster'] = cluster_labels
    # Cluster stats
    cluster_stats = df_clustered.groupby('Cluster').agg(['mean', 'std'])
    print("\nCluster Characteristics:")
    print(cluster_stats.round(3))
    # PCA visualization
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
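    # Report how much variance the 2-D projection retains, as a gauge of how
    # faithful the scatter plot is to the full feature space
    print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.round(3)}")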
    plt.figure(figsize=(8,6))
    scatter = plt.scatter(pca_result[:,0], pca_result[:,1], c=cluster_labels, cmap='tab10', alpha=0.7)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Clusters Visualized with PCA')
    plt.legend(*scatter.legend_elements(), title="Cluster")
    plt.tight_layout()
    plt.savefig('data/exports/clusters_pca.png', dpi=200)
    plt.close()
    print("\nSegmentation complete. Outputs saved to data/exports/.")

if __name__ == "__main__":
    main()