Spaces:

IVSD
/

EasyMachineLearningDemo

Sleeping

EasyMachineLearningDemo / analysis /exploratory_analysis.py

LLH

2024/02/14/01:14

bd39f54 over 1 year ago

4.01 kB

	import numpy as np
	import sklearn.metrics
	from sklearn.cluster import KMeans
	from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
	from factor_analyzer.factor_analyzer import calculate_kmo

	from coding.llh.visualization.draw_heat_map import draw_heat_map
	from coding.llh.visualization.draw_scatter import draw_scatter_2D, draw_scatter_2D_1, draw_scatter_3D_1, draw_scatter_3D


	# K-means
	def k_means(array: np.ndarray, n_clusters: int = 60, silhouette_sample_size: int = 200):
	    """Cluster ``array`` with k-means, plot the data, and report quality metrics.

	    The data is drawn before clustering, a ``KMeans`` model (k-means++ init)
	    is fitted on the full array, and the labelled data is drawn again together
	    with the cluster centers.

	    Args:
	        array: Samples to cluster, one sample per row.
	        n_clusters: Number of cluster centers to fit. Defaults to 60, the
	            value that used to be hard-coded.
	        silhouette_sample_size: Number of leading rows used for the
	            silhouette score. Defaults to 200, the value that used to be
	            hard-coded.

	    Returns:
	        A dict with the keys ``"Number of clustering centers"``, ``"SSE"``
	        (the fitted model's ``inertia_``) and ``"Silhouette score"``.
	        The silhouette score is NaN when the sampled rows do not contain a
	        number of distinct labels that ``silhouette_score`` accepts.
	    """
	    info = {}

	    draw_scatter_2D_1(array, "2D scatter data before k-means")
	    draw_scatter_3D_1(array, "3D scatter data before k-means")

	    info["Number of clustering centers"] = n_clusters

	    k_means_model = KMeans(n_clusters=n_clusters, init='k-means++')
	    k_means_model.fit(array)

	    info["SSE"] = k_means_model.inertia_

	    draw_scatter_2D(array, k_means_model.labels_, k_means_model.cluster_centers_, "2D scatter data after k-means")
	    draw_scatter_3D(array, k_means_model.labels_, k_means_model.cluster_centers_, "3D scatter data after k-means")

	    # Score the clustering that was actually fitted (and plotted) above.
	    # Calling fit_predict on the subset would refit the model on those rows
	    # only, so the score would describe a different clustering than the SSE.
	    sample = array[:silhouette_sample_size]
	    sample_labels = k_means_model.labels_[:silhouette_sample_size]

	    # silhouette_score needs 2 <= n_labels <= n_samples - 1; a leading slice
	    # of the data is not guaranteed to satisfy that.
	    distinct_labels = len(np.unique(sample_labels))
	    if 2 <= distinct_labels <= len(sample_labels) - 1:
	        info["Silhouette score"] = sklearn.metrics.silhouette_score(sample, sample_labels)
	    else:
	        info["Silhouette score"] = float("nan")

	    return info


	# Bartlett sphericity test
	def bartlett_test(df):
	    """Run Bartlett's sphericity test on ``df`` and return only its p-value.

	    ``calculate_bartlett_sphericity`` yields a two-element result; the first
	    element is discarded and the second is returned, which callers in this
	    module treat as the p-value.
	    """
	    outcome = calculate_bartlett_sphericity(df)
	    _statistic, significance = outcome
	    return significance


	# KMO test
	def kmo_test(df):
	    """Run the KMO test on ``df`` and return only the second result element.

	    ``calculate_kmo`` yields a two-element result; the first element is
	    discarded and the second is returned, which callers in this module
	    treat as the KMO score.
	    """
	    outcome = calculate_kmo(df)
	    _first, overall_score = outcome
	    return overall_score


	# Principal component analysis
	def pca(df, variance_threshold: float = 0.80):
	    """Run suitability tests and a correlation-matrix PCA on ``df``.

	    Steps:
	      1. Bartlett sphericity test and KMO test on all columns; the raw
	         values and an "Accept"/"Reject" verdict are stored in ``info``.
	      2. Eigen-decomposition of the correlation matrix (rounded to three
	         decimals) of the columns.
	      3. Selection of the leading components, ordered by descending
	         eigenvalue, up to and including the one whose cumulative
	         contribution reaches ``variance_threshold``.
	      4. Projection of the columns onto the selected eigenvectors.

	    Args:
	        df: pandas DataFrame; it is accessed through ``.iloc``, ``.values``
	            and ``.dot``.
	        variance_threshold: Cumulative eigenvalue contribution to reach.
	            Defaults to 0.80, the value that used to be hard-coded.

	    Returns:
	        ``(projected_array, info)`` where ``projected_array`` is the first
	        column of ``df`` followed by the projected components, and ``info``
	        is a dict with the test results and ``"Main factor num"``.
	    """
	    info = {}

	    # NOTE(review): every column, including the first one, feeds the
	    # analysis, and the first column is also passed through unchanged in
	    # the output. An earlier variant excluded it (iloc[:, 1:]) - confirm
	    # which behaviour is intended.
	    array_x = df.iloc[:, :]
	    array_y = df.iloc[:, :1]

	    # Bartlett sphericity test
	    p_value = bartlett_test(array_x)
	    info["p value of bartlett sphericity test"] = p_value
	    info["Result of bartlett sphericity test"] = "Accept" if p_value < 0.05 else "Reject"

	    # KMO test
	    kmo_score = kmo_test(array_x)
	    info["Score of KMO test"] = kmo_score
	    info["Result of KMO test"] = "Accept" if kmo_score > 0.5 else "Reject"

	    # Matrix of correlation coefficients between the columns.
	    correlation = np.around(np.corrcoef(array_x.T), decimals=3)

	    # The correlation matrix is symmetric, so eigh is the appropriate solver:
	    # it always returns real eigenvalues and eigenvectors.
	    eigenvalues, eigenvectors = np.linalg.eigh(correlation)
	    eigenvalues = np.around(eigenvalues, decimals=3)

	    # Order by descending eigenvalue using indices. A value -> index lookup
	    # would collapse equal eigenvalues onto a single eigenvector.
	    order = np.argsort(-eigenvalues, kind="stable")
	    eigenvalues = eigenvalues[order]
	    sorted_eigenvectors = eigenvectors[:, order]

	    # Contribution of each eigenvalue and the running total.
	    contribution = eigenvalues / np.sum(eigenvalues)
	    cumulative_contribution = np.cumsum(contribution)

	    # Keep every component that is still below the threshold plus the one
	    # that crosses it, so at least one component is always retained.
	    below_threshold = int(np.count_nonzero(cumulative_contribution < variance_threshold))
	    main_factor_num = min(below_threshold + 1, len(cumulative_contribution))

	    info["Main factor num"] = main_factor_num

	    # NOTE(review): the eigenvectors come from the correlation matrix but the
	    # raw, unstandardized columns are projected - confirm this is intended.
	    projected_array = array_x.dot(sorted_eigenvectors[:, :main_factor_num])
	    projected_array = np.concatenate((array_y.values, projected_array), axis=1)

	    return projected_array, info