import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

from coding.llh.visualization.draw_heat_map import draw_heat_map
from coding.llh.visualization.draw_scatter import draw_scatter_2D, draw_scatter_2D_1, draw_scatter_3D_1, draw_scatter_3D


# K-means
def k_means(array: np.ndarray, n_clusters: int = 60):
    """Cluster *array* with k-means and report summary statistics.

    The data is plotted before and after clustering through the project's
    scatter helpers.

    Args:
        array: Samples to cluster, passed unchanged to ``KMeans.fit`` and to
            the plotting helpers.
        n_clusters: Number of cluster centers. Defaults to 60, the value that
            was previously hard-coded.

    Returns:
        dict with the keys ``"Number of clustering centers"``, ``"SSE"``
        (the fitted model's ``inertia_``) and ``"Silhouette score"``.
    """
    info = {}

    draw_scatter_2D_1(array, "2D scatter data before k-means")
    draw_scatter_3D_1(array, "3D scatter data before k-means")

    info["Number of clustering centers"] = n_clusters

    k_means_model = KMeans(n_clusters=n_clusters, init='k-means++')
    k_means_model.fit(array)

    info["SSE"] = k_means_model.inertia_

    draw_scatter_2D(array, k_means_model.labels_, k_means_model.cluster_centers_, "2D scatter data after k-means")
    draw_scatter_3D(array, k_means_model.labels_, k_means_model.cluster_centers_, "3D scatter data after k-means")

    # NOTE: fit_predict refits the model on the first 200 rows only, so the
    # silhouette score describes that sub-clustering rather than the labels
    # plotted above. SSE and the plots were taken before this refit.
    sample = array[:200]
    sample_labels = k_means_model.fit_predict(sample)
    info["Silhouette score"] = sklearn.metrics.silhouette_score(sample, sample_labels)

    return info


# Bartlett sphericity test
def bartlett_test(df):
    """Return the p-value of Bartlett's sphericity test for *df*.

    The first element returned by ``calculate_bartlett_sphericity`` is
    discarded; only the second (the p-value) is returned.
    """
    bartlett_result = calculate_bartlett_sphericity(df)
    return bartlett_result[1]


# KMO test
def kmo_test(df):
    """Return the KMO score of *df*.

    The first element returned by ``calculate_kmo`` is discarded; only the
    second is returned.
    """
    kmo_result = calculate_kmo(df)
    return kmo_result[1]


# Principal component analysis
def pca(df, variance_threshold: float = 0.80):
    """Project *df* onto its leading principal components.

    The correlation matrix of all columns is eigendecomposed, the
    eigenvectors are ordered by descending eigenvalue, and the data is
    projected onto the smallest set of leading components whose cumulative
    contribution reaches ``variance_threshold``.

    Two defects of the earlier implementation are fixed here:

    * Eigenvectors were ordered through a dict keyed by the rounded
      eigenvalue, so repeated eigenvalues made one eigenvector appear twice
      while another was dropped. A stable argsort permutation is used now.
    * Components were kept only while the cumulative contribution was
      strictly below the threshold, which excluded the component that crosses
      it and produced zero components when the first one alone was enough.

    Args:
        df: Table supporting ``.iloc``, ``.T``, ``.dot`` and ``.values``
            (presumably a pandas DataFrame -- confirm against callers). Its
            first column is carried through unchanged as the first output
            column.
        variance_threshold: Cumulative contribution the retained components
            must reach. Defaults to 0.80, the value previously hard-coded.

    Returns:
        Tuple ``(projected_array, info)``. ``projected_array`` is the first
        column of *df* followed by the projected components; ``info`` holds
        the Bartlett and KMO results and ``"Main factor num"``.
    """
    info = {}

    # Every column (the first one included) takes part in the correlation
    # analysis; the first column is additionally passed through to the output.
    array_x = df.iloc[:, :]
    array_y = df.iloc[:, :1]

    # Bartlett sphericity test
    p_value = bartlett_test(array_x)
    info["p value of bartlett sphericity test"] = p_value
    info["Result of bartlett sphericity test"] = "Accept" if p_value < 0.05 else "Reject"

    # KMO test
    kmo_score = kmo_test(array_x)
    info["Score of KMO test"] = kmo_score
    info["Result of KMO test"] = "Accept" if kmo_score > 0.5 else "Reject"

    # Matrix of correlation coefficients between columns, rounded to 3 decimals.
    correlation = np.around(np.corrcoef(array_x.T), decimals=3)

    # Eigenvalues and eigenvectors of the correlation matrix. np.linalg.eig is
    # kept (rather than eigh) so eigenvector signs match earlier results.
    eigenvalues, eigenvectors = np.linalg.eig(correlation.T)
    eigenvalues = np.around(eigenvalues, decimals=3)

    # Descending order via a permutation, so equal eigenvalues each keep their
    # own eigenvector column.
    order = np.argsort(-eigenvalues, kind="stable")
    eigenvalues = eigenvalues[order]
    sorted_eigenvectors = eigenvectors[:, order]

    # Contribution of each eigenvalue and the running total.
    contribution = eigenvalues / np.sum(eigenvalues)
    cumulative_contribution = np.cumsum(contribution)

    # Smallest number of leading components whose cumulative contribution
    # reaches the threshold; fall back to all components if none does.
    reached = cumulative_contribution >= variance_threshold
    if reached.any():
        main_factor_num = int(np.argmax(reached)) + 1
    else:
        main_factor_num = len(cumulative_contribution)
    info["Main factor num"] = main_factor_num

    # Project the data and prepend the untouched first column.
    projected_array = array_x.dot(sorted_eigenvectors[:, :main_factor_num])
    projected_array = np.concatenate((array_y.values, projected_array), axis=1)

    return projected_array, info