File size: 2,842 Bytes
f35bff2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
"""
Run EDA: distributions, skewness, kurtosis, correlations, and a 2-D PCA projection (t-SNE is not implemented in this script)
"""
import os
import sys
import glob
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Find latest processed data file
def find_latest_data():
    """Return the path of the newest processed FRED data file.

    Candidates are the paths matching ``data/processed/fred_data_*.csv``
    (relative to the current working directory); the one with the largest
    ``os.path.getctime`` value is returned.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    candidates = glob.glob('data/processed/fred_data_*.csv')
    if candidates:
        # Rank by the timestamp reported by os.path.getctime and keep the newest.
        newest = max(candidates, key=os.path.getctime)
        return newest
    raise FileNotFoundError("No FRED data files found. Run the pipeline first.")

def _save_corr_heatmap(corr, title, out_path):
    """Draw *corr* as an annotated heatmap titled *title* and save it to *out_path*."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()


def main():
    """Run the EDA on the latest processed data file and save the plots.

    Steps, in order:
      1. Load the CSV returned by ``find_latest_data()`` (first column as
         index, parsed as dates).
      2. Print descriptive statistics, skewness and kurtosis, and save one
         histogram (with KDE) per column.
      3. Print Pearson and Spearman correlation matrices and save a heatmap
         of each.
      4. Standardize the rows that have no missing values, project them onto
         two principal components and save the scatter plot. This step is
         skipped with a message when fewer than 2 complete rows or fewer
         than 2 columns are available, since a 2-component PCA cannot be fit.

    All images are written under ``data/exports/``, which is created if it
    does not exist yet.

    Raises:
        FileNotFoundError: Propagated from ``find_latest_data()`` when no
            data file is present.
    """
    print("="*60)
    print("FRED EDA: Distributions, Skewness, Kurtosis, Correlations, PCA")
    print("="*60)
    data_file = find_latest_data()
    print(f"Using data file: {data_file}")
    df = pd.read_csv(data_file, index_col=0, parse_dates=True)

    # savefig does not create missing directories; make sure the target exists
    # so a fresh checkout does not fail on the first plot.
    export_dir = 'data/exports'
    os.makedirs(export_dir, exist_ok=True)

    # 1. Distributions, Skewness, Kurtosis
    desc = df.describe()
    skew = df.skew()
    kurt = df.kurtosis()
    print("\nDescriptive Statistics:\n", desc)
    print("\nSkewness:")
    print(skew)
    print("\nKurtosis:")
    print(kurt)
    # Plot distributions
    for col in df.columns:
        plt.figure(figsize=(8, 4))
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(f"Distribution of {col}")
        plt.savefig(f"{export_dir}/distribution_{col}.png", dpi=200, bbox_inches='tight')
        plt.close()

    # 2. Correlation matrices
    pearson_corr = df.corr(method='pearson')
    spearman_corr = df.corr(method='spearman')
    print("\nPearson Correlation Matrix:\n", pearson_corr.round(3))
    print("\nSpearman Correlation Matrix:\n", spearman_corr.round(3))
    _save_corr_heatmap(pearson_corr, 'Pearson Correlation Matrix',
                       f'{export_dir}/pearson_corr_matrix.png')
    _save_corr_heatmap(spearman_corr, 'Spearman Correlation Matrix',
                       f'{export_dir}/spearman_corr_matrix.png')

    # 3. PCA for visualization
    # PCA cannot handle NaNs, so only rows with every column present are used.
    df_clean = df.dropna()
    n_rows, n_cols = df_clean.shape
    if n_rows < 2 or n_cols < 2:
        # A 2-component PCA needs at least 2 samples and 2 features; skip
        # rather than crash after the other outputs were already produced.
        print(f"\nSkipping PCA: need at least 2 complete rows and 2 columns, "
              f"got {n_rows} rows x {n_cols} columns.")
    else:
        scaled = StandardScaler().fit_transform(df_clean)
        pca_result = PCA(n_components=2).fit_transform(scaled)
        pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'], index=df_clean.index)
        plt.figure(figsize=(8, 6))
        plt.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.5)
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.title('PCA Projection (2D)')
        plt.tight_layout()
        plt.savefig(f'{export_dir}/pca_2d.png', dpi=200)
        plt.close()
    print("\nEDA complete. Outputs saved to data/exports/.")

# Run the EDA only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()