Commit 3cfb9a3
LLH committed on 2024/03/09 11:50
1 Parent(s): 0c19671
- analysis/bayes_model.py +0 -82
- analysis/descriptive_analysis.py +0 -303
- analysis/distance_model.py +0 -115
- analysis/evaluation_model.py +0 -99
- analysis/exploratory_analysis.py +0 -130
- analysis/gaussian_model.py +0 -28
- analysis/gradient_model.py +0 -65
- analysis/kernel_model.py +0 -119
- analysis/linear_model.py +0 -217
- analysis/markov_model.py +0 -98
- analysis/model_train/tree_model.py +0 -9
- analysis/my_learning_curve.py +0 -33
- analysis/neural_model.py +0 -321
- analysis/others/evaluation_model.py +0 -99
- analysis/others/gaussian_model.py +0 -28
- analysis/others/markov_model.py +0 -98
- analysis/others/poly_model.py +0 -12
- analysis/poly_model.py +0 -12
- analysis/shap_model.py +0 -55
- analysis/tree_model.py +0 -290
- analysis/two_exponential_smoothing_model.py +0 -48
- functions/process.py +14 -8
- static/__init__.py +0 -0
- static/col.py +0 -68
- static/config.py +0 -136
- static/new_class.py +0 -195
- static/paint.py +0 -51
- static/process.py +0 -326
analysis/bayes_model.py
DELETED
@@ -1,82 +0,0 @@
-from sklearn.model_selection import learning_curve
-from sklearn.naive_bayes import *
-import numpy as np
-
-from static.new_class import Container
-from static.process import grid_search, bayes_search
-from visualization.draw_line_graph import draw_line_graph
-from visualization.draw_scatter_line_graph import draw_scatter_line_graph
-from metrics.calculate_classification_metrics import calculate_classification_metrics
-from metrics.calculate_regression_metrics import calculate_regression_metrics
-
-
-class NaiveBayesClassifierParams:
-    @classmethod
-    def get_params(cls, sort):
-        if sort == "MultinomialNB":
-            return {
-                "alpha": [0.1, 0.5, 1.0, 2.0]
-            }
-        elif sort == "GaussianNB":
-            return {}
-        elif sort == "ComplementNB":
-            return {
-                "alpha": [0.1, 0.5, 1, 10],
-                "fit_prior": [True, False],
-                "norm": [True, False]
-            }
-
-
-# Naive Bayes classification
-def naive_bayes_classification(container: Container, model=None):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    if model == "MultinomialNB":
-        naive_bayes_model = MultinomialNB()
-        params = NaiveBayesClassifierParams.get_params(model)
-    elif model == "GaussianNB":
-        naive_bayes_model = GaussianNB()
-        params = NaiveBayesClassifierParams.get_params(model)
-    elif model == "ComplementNB":
-        naive_bayes_model = ComplementNB()
-        params = NaiveBayesClassifierParams.get_params(model)
-    else:
-        naive_bayes_model = GaussianNB()
-        params = NaiveBayesClassifierParams.get_params(model)
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, naive_bayes_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, naive_bayes_model, x_train, y_train)
-    else:
-        best_model = naive_bayes_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    # y_pred = best_model.predict(x_test).reshape(-1, 1)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_classification_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
-
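For reference, the training flow this deleted module implemented reduces to the pattern below. This is a minimal self-contained sketch using scikit-learn only; the project's Container, grid_search, and bayes_search helpers are replaced with plain data and a direct fit, and the toy dataset is illustrative.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.naive_bayes import GaussianNB

# Toy data standing in for the container's train/test split.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the model (the deleted code optionally wrapped this step in a
# grid or Bayes hyperparameter search first).
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Learning-curve statistics, computed the same way as in the deleted code.
train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=5)
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

print("accuracy:", accuracy_score(y_test, y_pred))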
analysis/descriptive_analysis.py
DELETED
@@ -1,303 +0,0 @@
-
-from datetime import datetime
-
-import json
-import sys
-import numpy as np
-import pandas as pd
-import math
-import time as sys_time
-
-from coding.llh.visualization.draw_boxplot import draw_boxplot
-from coding.llh.visualization.draw_heat_map import draw_heat_map
-from coding.llh.visualization.draw_histogram import draw_histogram
-from coding.llh.visualization.draw_histogram_line_subgraph import draw_histogram_line_subgraph
-from coding.llh.visualization.draw_line_graph import draw_line_graph
-from tqdm import tqdm
-
-
-# 0202:
-def data_transformation_extra(df: pd.DataFrame, str2int_mappings: dict) -> (pd.DataFrame):
-
-    # Delete "match_id" column
-    # df.drop("match_id", axis=1, inplace=True)
-    df["match_id"] = df["match_id"].apply(lambda x: x[-4:])
-
-    # Dissolve the two-mode data mapping into two parts
-
-    value_to_replace_dict = {
-        "AD": "50"
-    }
-
-    value_to_replace = "AD"
-    df["p1_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)
-    df["p2_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)
-
-    str2int_mappings_to_dissolve = {
-        "p1_score": {"0": 0},
-        "p2_score": {"0": 0}
-    }
-
-    df["p1_score_mark"] = 0
-    df["p2_score_mark"] = 0
-
-    for key in str2int_mappings_to_dissolve.keys():
-        for i in range(1, len(df)):
-            if df.loc[i, key] == "15" and df.loc[i-1, key] == "0":
-                df.loc[i, key+"_mark"] = 1
-            elif df.loc[i, key] == "1" and df.loc[i-1, key] == "0":
-                df.loc[i, key + "_mark"] = 2
-
-    df["p1_score_normal"] = 0
-    df["p1_score_tiebreak"] = 0
-    df["p2_score_normal"] = 0
-    df["p2_score_tiebreak"] = 0
-
-    normal_counter = 0
-    tiebreak_counter = 0
-    for key in str2int_mappings_to_dissolve.keys():
-        for i in range(0, len(df)):
-            if df.loc[i, key] == "0":
-                normal_counter = 0
-                tiebreak_counter = 0
-                continue
-
-            if df.loc[i, key+"_mark"] == 1 or normal_counter > 0:
-                if int(df.loc[i, key]) > int(df.loc[i-1, key]):
-                    normal_counter += 1
-                    df.loc[i, key + "_normal"] = normal_counter
-                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
-                        str2int_mappings_to_dissolve[key][value_to_replace] = normal_counter
-                    else:
-                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = normal_counter
-
-                elif int(df.loc[i, key]) < int(df.loc[i-1, key]):
-                    normal_counter -= 1
-                    df.loc[i, key + "_normal"] = normal_counter
-
-                else:
-                    df.loc[i, key + "_normal"] = normal_counter
-
-            elif df.loc[i, key+"_mark"] == 2 or tiebreak_counter > 0:
-                if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
-                    tiebreak_counter += 1
-                    df.loc[i, key+"_tiebreak"] = tiebreak_counter
-                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
-                        str2int_mappings_to_dissolve[key][value_to_replace] = tiebreak_counter
-                    else:
-                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = tiebreak_counter
-
-                elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
-                    tiebreak_counter -= 1
-                    df.loc[i, key+"_tiebreak"] = tiebreak_counter
-
-                else:
-                    df.loc[i, key + "_tiebreak"] = tiebreak_counter
-
-    str2int_mappings.update(str2int_mappings_to_dissolve)
-
-    df.drop("p1_score_mark", axis=1, inplace=True)
-    df.drop("p2_score_mark", axis=1, inplace=True)
-    df.drop("p1_score", axis=1, inplace=True)
-    df.drop("p2_score", axis=1, inplace=True)
-
-    # Transform "elapsed_time" time column
-
-    def transform_time_col(time: str):
-        h, m, s = time.strip().split(":")
-        seconds = int(h) * 3600 + int(m) * 60 + int(s)
-        return seconds
-
-    df["elapsed_time"] = df["elapsed_time"].apply(transform_time_col)
-
-    # Calculate "game_victor", "set_victor" column cumulative value
-
-    df["p1_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 1 else 0, axis=1)
-    df["p2_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 2 else 0, axis=1)
-    df["p1_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 1 else 0, axis=1)
-    df["p2_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 2 else 0, axis=1)
-
-    df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
-    df["p2_game_victor"] = df.groupby(["player1", "player2"])["p2_game_victor"].cumsum()
-    df["p1_set_victor"] = df.groupby(["player1", "player2"])["p1_set_victor"].cumsum()
-    df["p2_set_victor"] = df.groupby(["player1", "player2"])["p2_set_victor"].cumsum()
-
-    # Forced conversion of data types
-    for col in df.columns.values:
-        df[col] = df[col].astype("float")
-
-    # Save the mappings to a json format file
-    with open("./data/mappings.json", "w", encoding="utf-8") as f:
-        json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)
-
-    return df
-
-
-def data_transformation(df: pd.DataFrame) -> (pd.DataFrame, dict):
-    """
-    0.
-    1. Define mappings
-    2. Create mappings
-    3. Modify the original data according to the mappings
-    4. Get type exception
-    5. Forced conversion of data types
-    """
-
-    info = {}
-
-    # Define mappings
-    str2int_mappings = {
-        "player1": {},
-        "player2": {},
-        "winner_shot_type": {},
-        "serve_width": {},
-        "serve_depth": {},
-        "return_depth": {}
-    }
-
-    # Create mappings
-    for col in str2int_mappings.copy():
-        keys = np.array(df[col].drop_duplicates())
-        values = [x for x in range(len(keys))]
-        str2int_mappings[col] = dict(zip(keys, values))
-
-    # Modify the original data according to the mappings
-    for col, mapping in str2int_mappings.items():
-        series = df[col]
-
-        for k, v in mapping.items():
-            series.replace(k, v, inplace=True)
-        df[col] = series
-
-    df.replace('Not A Number', 0, inplace=True)
-
-    # Get type exception
-
-    # abnormal_type_values = []
-    #
-    # for col in df.columns.values:
-    #     if col not in str2int_mappings.keys():
-    #         for row in df[col]:
-
-    #             if not (0 <= row <= sys.maxsize):
-    #                 abnormal_type_values.append(row)
-    #
-    # info["Number of abnormal type value"] = sorted(abnormal_type_values)
-
-
-    # # Forced conversion of data types
-    # for col in df.columns.values:
-    #     df[col] = df[col].astype("float")
-    #
-    # # Save the mappings to a json format file
-    # with open("./mappings.json", "w", encoding="utf-8") as f:
-    #     json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)
-
-
-    # 0202:
-    df = data_transformation_extra(df, str2int_mappings)
-
-    return df, info
-
-
-# Get descriptive indicators and filtered data based on boxplot
-def get_descriptive_indicators_related(df):
-    info = {}
-
-    descriptive_indicators_df = pd.DataFrame(
-        index=list(df.columns.values),
-        columns=[
-            "Min",
-            "Max",
-            "Avg",
-            "Standard Deviation",
-            "Standard Error",
-            "Upper Quartile",
-            "Median",
-            "Lower Quartile",
-            "Interquartile Distance",
-            "Kurtosis",
-            "Skewness",
-            "Coefficient of Variation"
-        ]
-    )
-
-    for col in df.columns.values:
-        descriptive_indicators_df["Min"][col] = df[col].min()
-        descriptive_indicators_df["Max"][col] = df[col].max()
-        descriptive_indicators_df["Avg"][col] = df[col].mean()
-        descriptive_indicators_df["Standard Deviation"][col] = df[col].std()
-        descriptive_indicators_df["Standard Error"][col] = descriptive_indicators_df["Standard Deviation"][col] / \
-            math.sqrt(len(df[col]))
-        descriptive_indicators_df["Upper Quartile"][col] = df[col].quantile(0.75)
-        descriptive_indicators_df["Median"][col] = df[col].quantile(0.5)
-        descriptive_indicators_df["Lower Quartile"][col] = df[col].quantile(0.25)
-        descriptive_indicators_df["Interquartile Distance"][col] = descriptive_indicators_df["Lower Quartile"][col] - \
-            descriptive_indicators_df["Upper Quartile"][col]
-        descriptive_indicators_df["Kurtosis"][col] = df[col].kurt()
-        descriptive_indicators_df["Skewness"][col] = df[col].skew()
-        descriptive_indicators_df["Coefficient of Variation"][col] = descriptive_indicators_df["Standard Deviation"][col] \
-            / descriptive_indicators_df["Avg"][col]
-
-    # draw_heat_map(descriptive_indicators_df.to_numpy(), "descriptive indicators", True)
-    #
-    # draw_boxplot(df, "descriptive indicators boxplot")
-
    len_0 = len(df)
-
-    # tmp_df = \
-    #     df[(df >= (descriptive_indicators_df["Lower Quartile"] - 1.5 * (descriptive_indicators_df["Upper Quartile"] -
-    #     descriptive_indicators_df["Lower Quartile"])))
-    #     & (df <= (descriptive_indicators_df["Upper Quartile"] + 1.5 * (descriptive_indicators_df["Upper Quartile"] -
-    #     descriptive_indicators_df["Lower Quartile"])))][[
-    #     "ProductChoice", "MembershipPoints", "ModeOfPayment", "ResidentCity", "PurchaseTenure", "IncomeClass",
-    #     "CustomerPropensity", "CustomerAge", "LastPurchaseDuration"
-    # ]]
-
-    # tmp_df.dropna(inplace=True)
-
-    # df = pd.concat([tmp_df, df[["ProductChoice", "Channel", "MartialStatus"]]], axis=1, join="inner")
-
-    # df = pd.concat([df.iloc[:, :9], df.iloc[:, 10:]], axis=1)
-
-    # info["Number of offsetting value"] = len_0 - len(df)
-    #
-    # info["Total size of filtered data after descriptive analysis"] = len(df)
-
-    return df, info
-
-
-# Create images of the distribution of the number of each variable
-def variable_distribution(df):
-    counts_mappings = {}
-    print("counts analysis")
-    for col in tqdm(df.columns.values, desc='columns:'):
-        counts_mapping = {}
-        for x in tqdm(df[col], desc='cells'):
-            if x in counts_mapping.keys():
-                counts_mapping[x] += 1
-            else:
-                counts_mapping[x] = 1
-        counts_mappings[col] = counts_mapping
-
-    total_data_for_plot = []
-    print("plotting")
-    for col, mapping in tqdm(counts_mappings.items(), desc='columns'):
-        if col in ["set_no", 'game_no']:
-            sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[0])
-            data = [x[1] for x in sorting]
-            labels = [x[0] for x in sorting]
-
-            total_data_for_plot.append(["line_graph", labels, data, col])
-            draw_line_graph(labels, data, col)
-        else:
-            sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[1])
-            data = [x[1] for x in sorting]
-            labels = [x[0] for x in sorting]
-
-            will_rotate = True if col in ["player1", "player2", "match_id"] else False
-            will_show_text = False if col in ["ResidentCity"] else True
-
-            total_data_for_plot.append(["histogram", data, labels, will_rotate, will_show_text, col])
-            draw_histogram(data, labels, will_rotate, will_show_text, col)
-    # draw_histogram_line_subgraph(total_data_for_plot)
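The two transformations most specific to this file are the elapsed-time conversion and the cumulative game/set counters. A minimal sketch of both on a toy frame (column names follow the deleted code; the data is made up):

import pandas as pd

df = pd.DataFrame({
    "elapsed_time": ["0:01:30", "0:02:15"],
    "player1": ["A", "A"], "player2": ["B", "B"],
    "game_victor": [1, 2],
})

# "h:m:s" -> total seconds, as in transform_time_col.
def transform_time_col(time: str) -> int:
    h, m, s = time.strip().split(":")
    return int(h) * 3600 + int(m) * 60 + int(s)

df["elapsed_time"] = df["elapsed_time"].apply(transform_time_col)

# Per-point win indicator, then a running total within each match pairing.
df["p1_game_victor"] = (df["game_victor"] == 1).astype(int)
df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
print(df)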
analysis/distance_model.py
DELETED
@@ -1,115 +0,0 @@
-from sklearn.model_selection import learning_curve
-
-from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
-from analysis.shap_model import *
-from metrics.calculate_classification_metrics import calculate_classification_metrics
-from metrics.calculate_regression_metrics import calculate_regression_metrics
-from static.new_class import *
-from static.process import grid_search, bayes_search
-
-
-class KNNClassifierParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            "n_neighbors": [3, 5, 7, 9],
-            "weights": ['uniform', 'distance'],
-            "p": [1, 2]
-        }
-
-
-# KNN classification
-def knn_classifier(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    knn_classifier_model = KNeighborsClassifier()
-    params = KNNClassifierParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, knn_classifier_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, knn_classifier_model, x_train, y_train)
-    else:
-        best_model = knn_classifier_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_classification_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
-
-
-class KNNRegressionParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            "n_neighbors": [3, 5, 7, 9],
-            "weights": ['uniform', 'distance'],
-            "p": [1, 2]
-        }
-
-
-# KNN regression
def knn_regression(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    knn_regression_model = KNeighborsRegressor()
-    params = KNNRegressionParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, knn_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, knn_regression_model, x_train, y_train)
-    else:
-        best_model = knn_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    # y_pred = best_model.predict(x_test).reshape(-1, 1)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_regression_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
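grid_search and bayes_search come from the also-deleted static/process.py and are not shown in this commit. Judging by their call signature they presumably wrap scikit-learn style search objects; the sketch below is an assumption about what grid_search could look like, not the project's actual implementation.

from sklearn.model_selection import GridSearchCV

def grid_search(params, model, x_train, y_train, cv=5):
    # Exhaustive search over the parameter grid; GridSearchCV refits the best
    # parameter combination on the full training data, so the returned
    # estimator is ready to predict with.
    search = GridSearchCV(model, params, cv=cv)
    search.fit(x_train, y_train)
    return search.best_estimator_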
analysis/evaluation_model.py
DELETED
@@ -1,99 +0,0 @@
-import numpy as np
-import skfuzzy as fuzz
-from skfuzzy import control as ctrl
-import matplotlib.pyplot as plt
-
-
-def fuzzy_comprehensive_evaluation_model():
-    # Create the fuzzy variables and fuzzy sets
-    technical_skill = ctrl.Antecedent(np.arange(0, 101, 1), 'technical_skill')
-    physical_condition = ctrl.Antecedent(np.arange(0, 101, 1), 'physical_condition')
-    mental_toughness = ctrl.Antecedent(np.arange(0, 101, 1), 'mental_toughness')
-    opponent_strength = ctrl.Antecedent(np.arange(0, 101, 1), 'opponent_strength')
-
-    performance = ctrl.Consequent(np.arange(0, 101, 1), 'performance')
-
-    # Define the fuzzy membership functions
-    technical_skill['low'] = fuzz.trimf(technical_skill.universe, [0, 0, 50])
-    technical_skill['medium'] = fuzz.trimf(technical_skill.universe, [0, 50, 100])
-    technical_skill['high'] = fuzz.trimf(technical_skill.universe, [50, 100, 100])
-
-    physical_condition['low'] = fuzz.trimf(physical_condition.universe, [0, 0, 50])
-    physical_condition['medium'] = fuzz.trimf(physical_condition.universe, [0, 50, 100])
-    physical_condition['high'] = fuzz.trimf(physical_condition.universe, [50, 100, 100])
-
-    mental_toughness['low'] = fuzz.trimf(mental_toughness.universe, [0, 0, 50])
-    mental_toughness['medium'] = fuzz.trimf(mental_toughness.universe, [0, 50, 100])
-    mental_toughness['high'] = fuzz.trimf(mental_toughness.universe, [50, 100, 100])
-
-    opponent_strength['low'] = fuzz.trimf(opponent_strength.universe, [0, 0, 50])
-    opponent_strength['medium'] = fuzz.trimf(opponent_strength.universe, [0, 50, 100])
-    opponent_strength['high'] = fuzz.trimf(opponent_strength.universe, [50, 100, 100])
-
-    performance['poor'] = fuzz.trimf(performance.universe, [0, 0, 50])
-    performance['average'] = fuzz.trimf(performance.universe, [0, 50, 100])
-    performance['excellent'] = fuzz.trimf(performance.universe, [50, 100, 100])
-
-    # Set the defuzzification method for the output: centroid defuzzification
-    performance.defuzzify_method = 'centroid'
-
-    # Define the rules
-    rule1 = ctrl.Rule(
-        technical_skill['low'] | physical_condition['low'] | mental_toughness['low'] | opponent_strength['low'],
-        performance['poor']
-    )
-    rule2 = ctrl.Rule(
-        technical_skill['medium'] | physical_condition['medium'] | mental_toughness['medium'] | opponent_strength['medium'],
-        performance['average']
-    )
-    rule3 = ctrl.Rule(
-        technical_skill['high'] | physical_condition['high'] | mental_toughness['high'] | opponent_strength['high'],
-        performance['excellent']
-    )
-
-    # Build the control system
-    performance_evaluation = ctrl.ControlSystem([rule1, rule2, rule3])
-    performance_evaluator = ctrl.ControlSystemSimulation(performance_evaluation)
-
-    # Input data
-    performance_evaluator.input['technical_skill'] = 75
-    performance_evaluator.input['physical_condition'] = 80
-    performance_evaluator.input['mental_toughness'] = 85
-    performance_evaluator.input['opponent_strength'] = 60
-
-    # Compute the fuzzy comprehensive score
-    performance_evaluator.compute()
-
-    # Print the result
-    print("Fuzzy comprehensive score:", performance_evaluator.output['performance'])
-
-    # Plot the fuzzy sets
-    technical_skill.view("technical_skill", sim=performance_evaluator)
-    physical_condition.view("physical_condition", sim=performance_evaluator)
-    mental_toughness.view("mental_toughness", sim=performance_evaluator)
-    opponent_strength.view("opponent_strength", sim=performance_evaluator)
-    performance.view("performance", sim=performance_evaluator)
-
-    # Perform sensitivity analysis (by varying the input values)
-
-    # input_var_1:
-
-    # input_values = np.arange(0, 11, 1)
-    # output_values = []
-    #
-    # for val in input_values:
-    #     fuzzy_control_sys_simulation.input["input_var_1"] = val
-    #     fuzzy_control_sys_simulation.compute()
-    #     output_values.append(fuzzy_control_sys_simulation.output["output_var"])
-    #
-    # plt.plot(
-    #     input_values,
-    #     output_values,
-    #     label="Sensitivity Analysis"
-    # )
-    # plt.xlabel("Input Variable 1")
-    # plt.ylabel("Output Variable")
-    # plt.legend()
-    # plt.show()
-    #
-    # return fuzzy_control_sys_simulation.output["output_var"]
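The trimf membership functions used above are plain triangles. The sketch below computes one directly in NumPy to show what fuzz.trimf(universe, [a, b, c]) evaluates to, without depending on scikit-fuzzy; the function name trimf is reused here purely for illustration.

import numpy as np

def trimf(x: np.ndarray, a: float, b: float, c: float) -> np.ndarray:
    # Triangular membership: 0 outside [a, c], rising to 1 at x == b.
    left = (x - a) / (b - a) if b != a else np.ones_like(x, dtype=float)
    right = (c - x) / (c - b) if c != b else np.ones_like(x, dtype=float)
    return np.clip(np.minimum(left, right), 0.0, 1.0)

universe = np.arange(0, 101, 1)
low = trimf(universe, 0, 0, 50)   # matches fuzz.trimf(universe, [0, 0, 50])
print(low[0], low[25], low[50])   # 1.0 0.5 0.0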
analysis/exploratory_analysis.py
DELETED
@@ -1,130 +0,0 @@
-import numpy as np
-import sklearn.metrics
-from sklearn.cluster import KMeans
-from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
-from factor_analyzer.factor_analyzer import calculate_kmo
-
-from coding.llh.visualization.draw_heat_map import draw_heat_map
-from coding.llh.visualization.draw_scatter import draw_scatter_2D, draw_scatter_2D_1, draw_scatter_3D_1, draw_scatter_3D
-
-
-# K-means
-def k_means(array: np.ndarray):
-    info = {}
-
-    draw_scatter_2D_1(array, "2D scatter data before k-means")
-    draw_scatter_3D_1(array, "3D scatter data before k-means")
-
-    K = 60
-
-    info["Number of clustering centers"] = K
-
-    k_means_model = KMeans(n_clusters=K, init='k-means++')
-
-    k_means_model.fit(array)
-
-    sum_of_squared_errors = k_means_model.inertia_
-
-    info["SSE"] = sum_of_squared_errors
-
-    draw_scatter_2D(array, k_means_model.labels_, k_means_model.cluster_centers_, "2D scatter data after k-means")
-    draw_scatter_3D(array, k_means_model.labels_, k_means_model.cluster_centers_, "3D scatter data after k-means")
-
-    result = k_means_model.fit_predict(array[:200])
-
-    silhouette_score = sklearn.metrics.silhouette_score(array[:200], result)
-
-    info["Silhouette score"] = silhouette_score
-
-    return info
-
-
-# Bartlett sphericity test
-def bartlett_test(df):
-    _, p_value = calculate_bartlett_sphericity(df)
-
-    return p_value
-
-
-# KMO test
-def kmo_test(df):
-    _, kmo_score = calculate_kmo(df)
-
-    return kmo_score
-
-
-# Principal component analysis
-def pca(df):
-    # Only consider the correlation of the independent variables
-    info = {}
-
-    # array_x = df.iloc[:, 1:]
-    array_x = df.iloc[:, :]
-    array_y = df.iloc[:, :1]
-
-    # Bartlett sphericity test
-    p_value = bartlett_test(array_x)
-    info["p value of bartlett sphericity test"] = p_value
-    if p_value < 0.05:
-        info["Result of bartlett sphericity test"] = "Accept"
-    else:
-        info["Result of bartlett sphericity test"] = "Reject"
-
-    # KMO test
-    kmo_score = kmo_test(array_x)
-    info["Score of KMO test"] = kmo_score
-    if kmo_score > 0.5:
-        info["Result of KMO test"] = "Accept"
-    else:
-        info["Result of KMO test"] = "Reject"
-
-    # get the matrix of correlation coefficients
-    covX = np.around(np.corrcoef(array_x.T), decimals=3)
-
-    # Compute the standard deviations from the diagonal of the covariance matrix
-    std_dev = np.sqrt(np.diag(covX))
-
-    # Compute the Pearson correlation coefficient matrix
-    pearson_matrix = covX / np.outer(std_dev, std_dev)
-
-    # draw_heat_map(pearson_matrix, "pearson matrix", True, df.columns.values)
-
-    # Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
-    eigenvalues, eigenvectors = np.linalg.eig(covX.T)
-
-    eigenvalues = np.around(eigenvalues, decimals=3)
-
-    eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))
-
-    # Sort feature values in descending order
-    eigenvalues = sorted(eigenvalues, reverse=True)
-
-    for i, value in enumerate(eigenvalues):
-        if i == 0:
-            sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
-        else:
-            sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)
-
-    # draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")
-
-    # get the contribution of the eigenvalues
-    contribution = eigenvalues / np.sum(eigenvalues)
-
-    # get the cumulative contribution of the eigenvalues
-    cumulative_contribution = np.cumsum(contribution)
-
-    # Selection of principal components
-    main_factors_index = [i for i in range(len(cumulative_contribution)) if cumulative_contribution[i] < 0.80]
-
-    main_factor_num = len(main_factors_index)
-
-    info["Main factor num"] = main_factor_num
-
-    # Get the projection matrix
-    projected_array = array_x.dot(sorted_eigenvectors[:, :main_factor_num])
-    projected_array = np.concatenate((array_y.values, projected_array), axis=1)
-
-    return projected_array, info
-
-
-
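The pca function above does PCA by hand via an eigen-decomposition of the correlation matrix. A compact sketch of the same steps on toy data, NumPy only, using eigh/argsort in place of the deleted code's eigenvalue-to-index dict, and keeping enough components to reach 80% cumulative contribution (a close variant of the deleted code's strictly-below-80% cutoff):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))

corr = np.corrcoef(X.T)                  # correlation matrix of the features
eigvals, eigvecs = np.linalg.eigh(corr)  # symmetric matrix, so eigh is appropriate
order = np.argsort(eigvals)[::-1]        # sort descending, keeping vectors aligned
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

contribution = eigvals / eigvals.sum()
cumulative = np.cumsum(contribution)
k = int(np.searchsorted(cumulative, 0.80)) + 1  # smallest k reaching 80%

projected = X @ eigvecs[:, :k]           # project onto the top-k components
print(k, projected.shape)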
analysis/gaussian_model.py
DELETED
@@ -1,28 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from sklearn.mixture import GaussianMixture
-
-
-def gaussian_mix(x):
-    x = x.reshape(-1, 1)
-    n_components = 2000  # adjust the number of mixture components as needed
-    gmm = GaussianMixture(n_components=n_components, covariance_type='full')
-
-    # Fit the model
-    gmm.fit(x)
-
-    # Predict the component each data point belongs to
-    continuous_data = gmm.sample(len(x))[0].reshape(-1)
-
-    return continuous_data
-
-    # Fit the data with a Gaussian mixture model
-    # gmm = GaussianMixture(n_components=50)  # choose the number of mixture components
-    # gmm.fit(x.reshape(-1, 1))
-
-    # Generate continuous data
-    # return np.linspace(min(x), max(x), len(x)).flatten()
-
-    # z = np.exp(gmm.score_samples(y.reshape(-1, 1)))
-
-    # return z
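gaussian_mix fits a GaussianMixture and then samples from it to produce a continuous series the same length as the input. A minimal sketch of that fit-and-sample round trip, with a small, more typical component count (2000 components as above would usually overfit or fail outright on short inputs):

import numpy as np
from sklearn.mixture import GaussianMixture

x = np.random.default_rng(0).normal(size=500).reshape(-1, 1)

gmm = GaussianMixture(n_components=3, covariance_type="full", random_state=0)
gmm.fit(x)

# sample() returns (samples, component_labels); keep the samples only.
continuous_data = gmm.sample(len(x))[0].reshape(-1)
print(continuous_data.shape)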
analysis/gradient_model.py
DELETED
@@ -1,65 +0,0 @@
-import numpy as np
-from sklearn.ensemble import GradientBoostingRegressor
-from sklearn.model_selection import learning_curve
-
-from analysis.shap_model import draw_shap_beeswarm
-from metrics.calculate_regression_metrics import calculate_regression_metrics
-from static.config import Config
-from static.new_class import Container
-from static.process import grid_search, bayes_search
-
-
-class GradientBoostingParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            'n_estimators': [50, 100, 150],
-            'learning_rate': [0.01, 0.1, 0.2],
-            'max_depth': [3, 5, 7],
-            'min_samples_split': [2, 5, 10],
-            'min_samples_leaf': [1, 2, 4]
-        }
-
-
-# Gradient boosting regression
-def gradient_boosting_regression(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    gradient_boosting_regression_model = GradientBoostingRegressor(random_state=Config.RANDOM_STATE)
-    params = GradientBoostingParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, gradient_boosting_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, gradient_boosting_regression_model, x_train, y_train)
-    else:
-        best_model = gradient_boosting_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    # y_pred = best_model.predict(x_test).reshape(-1, 1)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_regression_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
analysis/kernel_model.py
DELETED
@@ -1,119 +0,0 @@
-import numpy as np
-from sklearn.model_selection import learning_curve
-from sklearn.svm import SVC
-from sklearn.svm import SVR
-
-from metrics.calculate_classification_metrics import calculate_classification_metrics
-from metrics.calculate_regression_metrics import calculate_regression_metrics
-from static.config import Config
-from static.new_class import Container
-from static.process import grid_search, bayes_search
-
-
-class SVMRegressionParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            'kernel': ['linear', 'rbf'],
-            'C': [0.1, 1, 10, 100],
-            'gamma': [0.01, 0.1, 1, 10],
-            'epsilon': [0.01, 0.1, 1]
-        }
-
-
-# Support vector machine regression
-def svm_regression(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    svm_regression_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
-    params = SVMRegressionParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, svm_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, svm_regression_model, x_train, y_train)
-    else:
-        best_model = svm_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    # y_pred = best_model.predict(x_test).reshape(-1, 1)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_regression_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
-
-
-class SVMClassifierParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            "C": [0.1, 1, 10, 100],
-            "kernel": ['linear', 'rbf', 'poly'],
-            "gamma": [0.1, 1, 10]
-        }
-
-
-# Support vector machine classification
-def svm_classifier(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    svm_classifier_model = SVC(kernel="rbf")
-    params = SVMClassifierParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, svm_classifier_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, svm_classifier_model, x_train, y_train)
-    else:
-        best_model = svm_classifier_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    y_pred = best_model.predict(x_test)
-    # y_pred = best_model.predict(x_test).reshape(-1, 1)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_classification_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
analysis/linear_model.py
DELETED
@@ -1,217 +0,0 @@
-import numpy as np
-from sklearn.linear_model import LinearRegression
-from sklearn.preprocessing import PolynomialFeatures
-from sklearn.linear_model import Lasso
-from sklearn.linear_model import Ridge
-from sklearn.linear_model import ElasticNet
-from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import learning_curve
-
-from static.process import grid_search, bayes_search
-from metrics.calculate_classification_metrics import calculate_classification_metrics
-from metrics.calculate_regression_metrics import calculate_regression_metrics
-from static.new_class import *
-from static.config import Config
-
-
-class LinearRegressionParams:
-    @classmethod
-    def get_params(cls, sort):
-        if sort in ["Lasso", "Ridge", "ElasticNet"]:
-            return {
-                "fit_intercept": [True, False],
-                "alpha": [0.001, 0.01, 0.1, 1.0, 10.0],
-                "random_state": [Config.RANDOM_STATE]
-            }
-        else:
-            return {
-                "fit_intercept": [True, False]
-            }
-
-
-# Linear regression
-def linear_regression(container: Container, model=None):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    if model == "Lasso":
-        linear_regression_model = Lasso(alpha=0.1, random_state=Config.RANDOM_STATE)
-        params = LinearRegressionParams.get_params(model)
-    elif model == "Ridge":
-        linear_regression_model = Ridge(alpha=0.1, random_state=Config.RANDOM_STATE)
-        params = LinearRegressionParams.get_params(model)
-    elif model == "ElasticNet":
-        linear_regression_model = ElasticNet(alpha=0.1, random_state=Config.RANDOM_STATE)
-        params = LinearRegressionParams.get_params(model)
-    elif model == "LinearRegression":
-        linear_regression_model = LinearRegression()
-        params = LinearRegressionParams.get_params(model)
-    else:
-        linear_regression_model = LinearRegression()
-        params = LinearRegressionParams.get_params(model)
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, linear_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, linear_regression_model, x_train, y_train)
-    else:
-        best_model = linear_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    # lr_intercept = best_model.intercept_
-    # info["Intercept of linear regression equation"] = lr_intercept
-    #
-    # lr_coef = best_model.coef_
-    # info["Coefficients of linear regression equation"] = lr_coef
-
-    y_pred = best_model.predict(x_test)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["参数"] = calculate_regression_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
-
-
-class PolynomialRegressionParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            "polynomial_features__degree": [2, 3],
-            "linear_regression_model__fit_intercept": [True, False]
-        }
-
-
-# Polynomial regression
-def polynomial_regression(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    polynomial_features = PolynomialFeatures(degree=2)
-    linear_regression_model = LinearRegression()
-
-    polynomial_regression_model = Pipeline([("polynomial_features", polynomial_features),
-                                            ("linear_regression_model", linear_regression_model)])
-    params = PolynomialRegressionParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, polynomial_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, polynomial_regression_model, x_train, y_train)
-    else:
-        best_model = polynomial_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    # feature_names = best_model["polynomial_features"].get_feature_names_out()
-    # info["Feature names of polynomial regression"] = feature_names
-    #
-    # lr_intercept = best_model["linear_regression_model"].intercept_
-    # info["Intercept of polynomial regression equation"] = lr_intercept
-    #
-    # lr_coef = best_model["linear_regression_model"].coef_
-    # info["Coefficients of polynomial regression equation"] = lr_coef
-
-    x_test_ = best_model["polynomial_features"].fit_transform(x_test)
-    y_pred = best_model["linear_regression_model"].predict(x_test_)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
-
-    info["指标"] = calculate_regression_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
-
-
-class LogisticRegressionParams:
-    @classmethod
-    def get_params(cls):
-        return {
-            "C": [0.001, 0.01, 0.1, 1.0, 10.0],
-            "max_iter": [100, 200, 300],
-            "solver": ["liblinear", "lbfgs", "newton-cg", "sag", "saga"],
-            "random_state": [Config.RANDOM_STATE]
-        }
-
-
-# Logistic regression classification
-def logistic_regression(container: Container):
-    x_train = container.x_train
-    y_train = container.y_train
-    x_test = container.x_test
-    y_test = container.y_test
-    hyper_params_optimize = container.hyper_params_optimize
-    info = {}
-
-    logistic_regression_model = LogisticRegression(random_state=Config.RANDOM_STATE)
-    params = LogisticRegressionParams.get_params()
-
-    if hyper_params_optimize == "grid_search":
-        best_model = grid_search(params, logistic_regression_model, x_train, y_train)
-    elif hyper_params_optimize == "bayes_search":
-        best_model = bayes_search(params, logistic_regression_model, x_train, y_train)
-    else:
-        best_model = logistic_regression_model
-        best_model.fit(x_train, y_train)
-
-    info["参数"] = best_model.get_params()
-
-    # lr_intercept = best_model.intercept_
-    # info["Intercept of logistic regression equation"] = lr_intercept.tolist()
-    #
-    # lr_coef = best_model.coef_
-    # info["Coefficients of logistic regression equation"] = lr_coef.tolist()
-
-    y_pred = best_model.predict(x_test)
-    container.set_y_pred(y_pred)
-
-    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
-
-    train_scores_mean = np.mean(train_scores, axis=1)
-    train_scores_std = np.std(train_scores, axis=1)
-    test_scores_mean = np.mean(test_scores, axis=1)
-    test_scores_std = np.std(test_scores, axis=1)
-    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
-                                        test_scores_std)
-
-    info["指标"] = calculate_classification_metrics(y_pred, y_test)
-
-    container.set_info(info)
-    container.set_status("trained")
-    container.set_model(best_model)
-
-    return container
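One detail worth noting in polynomial_regression above: it calls fit_transform on the pipeline's PolynomialFeatures step for x_test and then calls the inner regressor directly, which refits the transformer on test data. Calling predict on the pipeline itself applies the transform already fitted on the training data; a minimal sketch on made-up data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
x_train = rng.uniform(-3, 3, size=(100, 1))
y_train = (x_train ** 2 + rng.normal(scale=0.1, size=(100, 1))).ravel()
x_test = rng.uniform(-3, 3, size=(20, 1))

model = Pipeline([("polynomial_features", PolynomialFeatures(degree=2)),
                  ("linear_regression_model", LinearRegression())])
model.fit(x_train, y_train)

# The pipeline reuses the transform fitted on x_train; no refit on test data.
y_pred = model.predict(x_test)
print(y_pred[:3])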
analysis/markov_model.py
DELETED
@@ -1,98 +0,0 @@
-import numpy as np
-import pandas as pd
-from hmmlearn import hmm
-
-
-def train_and_predict_hidden_markov_model(df):
-    window_size = 10
-
-    # train_df = df[['point_won', 'point_loss', 'ace', 'winner', 'double_fault', 'unf_err', 'net_point', 'net_point_won', 'break_pt', 'break_pt_won', 'break_pt_miss']]
-
-    train_df = df
-    # "p1_winner",
-    # "p2_winner",
-    # "winner_shot_type",
-    # "p1_double_fault",
-    # "p2_double_fault",
-    # "p1_unf_err",
-    # "p2_unf_err",
-    # "p1_net_pt_won",
-    # "p2_net_pt_won",
-    # "p1_break_pt_won",
-    # "p2_break_pt_won",
-    # "rally_count",
-    # "serve_width",
-    # "serve_depth",
-    # "return_depth"
-    df["observation"] = 0
-
-    # mapping = {}
-    # counter = 0
-    # for i in range(len(train_df)):
-    #     cur_combination = train_df.iloc[i].to_list()
-    #
-    #     if str(cur_combination) not in mapping.keys():
-    #         mapping[str(cur_combination)] = counter
-    #         df.loc[i, "observation"] = counter
-    #         counter += 1
-    #     else:
-    #         df.loc[i, "observation"] = mapping[str(cur_combination)]
-
-    observation_list = df["observation"].to_list()
-
-    # value_separated_observation_list = [observation_list[i - window_size: i] for i in range(window_size, len(observation_list))]
-    # value_separated_observation_list = [[0] * window_size] * window_size + value_separated_observation_list
-
-    observations = np.array([np.sum(np.array([train_df.iloc[j].to_list() for j in range(i-window_size, i)]).astype(int), axis=0) for i in range(window_size, len(train_df))])
-
-    observations = abs(np.min(observations)) + observations
-
-    observations = observations.astype(int)
-
-    m_observations = np.concatenate(
-        (np.array([observations[0].tolist()] * window_size), observations),
-        axis=0
-    )
-
-    df = pd.concat([df, pd.DataFrame({"window_observation": m_observations.tolist()})], axis=1)
-
-    hidden_markov_model = hmm.MultinomialHMM(n_components=5, n_iter=50, tol=0.01)
-
-    hidden_markov_model.fit(observations)
-
-    start_prob = hidden_markov_model.startprob_
-    transition_prob = hidden_markov_model.transmat_
-    emission_prob = hidden_markov_model.emissionprob_
-
-    neg_log_likelihood, pred = calculate_momentum(df, hidden_markov_model, m_observations)
-
-    _, hidden2observation = hidden_markov_model.score_samples(observations)
-
-    state_impacts = np.sum(hidden2observation, axis=0)
-
-    return state_impacts, neg_log_likelihood, pred, start_prob, transition_prob, emission_prob
-
-    state_impacts = np.zeros((num_states, num_obs))
-
-    for t in range(num_obs):
-        for i in range(num_states):
-            state_impacts[i, t] = (forward_prob[t, i] * backward_prob[t, i]) / np.sum(
-                forward_prob[t, :] * backward_prob[t, :])
-
-    return neg_log_likelihood, pred, start_prob, transition_prob, emission_prob
-
-
-def calculate_momentum(df, hidden_markov_model, m_observations):
-    # pred_list = []
-    # neg_log_likelihood_list = []
-    # for i in range(len(df)):
-    #     neg_log_likelihood, pred = hidden_markov_model.decode(np.array([df.loc[i, "window_observation"]]))
-    #     pred_list.append(pred[0])
-    #     neg_log_likelihood_list.append(neg_log_likelihood)
-    #
-    #     return pred_list, neg_log_likelihood_list
-
-    neg_log_likelihood, pred = hidden_markov_model.decode(m_observations)
-
-    return neg_log_likelihood, pred
-
analysis/model_train/tree_model.py
CHANGED
@@ -99,21 +99,12 @@ class DecisionTreeClassifierParams:
 
 # Decision tree classification
 def decision_tree_classifier(container, params):
-    import logging
-    logging.basicConfig(level=logging.NOTSET)
-    logging.info(str(params), logging.getLevelName(logging.INFO))
-    print(str(params))
-
-
     x_train, y_train, x_test, y_test, hyper_params_optimize = get_values_from_container_class(container)
     info = {}
 
     params = transform_params_list(DecisionTreeClassifierParams, params)
     params['random_state'] = [StaticValue.RANDOM_STATE]
 
-    logging.info(str(params), logging.getLevelName(logging.INFO))
-    print(str(params))
-
     random_forest_regression_model = DecisionTreeClassifier(random_state=StaticValue.RANDOM_STATE)
 
     if hyper_params_optimize == "grid_search":
analysis/my_learning_curve.py
DELETED
@@ -1,33 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from coding.llh.metrics.calculate_regression_metrics import calculate_ar2


def my_learning_curve(estimator, X, y, cv=5):
    train_sizes = np.linspace(0.1, 1.0, 10)[:-1]
    train_scores = []
    val_scores = []

    for train_size in train_sizes:
        # Split the dataset into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, random_state=42)

        # Train the model on the training set
        # estimator.fit(X_train, y_train)

        # Evaluate the model on the training set
        y_train_pred = estimator.predict(X_train)
        train_accuracy = r2_score(y_train, y_train_pred)
        train_scores.append(train_accuracy)

        # Evaluate the model on the validation set
        y_val_pred = estimator.predict(X_val)
        val_accuracy = r2_score(y_val, y_val_pred)
        val_scores.append(val_accuracy)

    return train_sizes, train_scores, val_scores
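Note on the deleted file above: the `estimator.fit` call is commented out, so every iteration scores the same externally fitted model and the resulting "learning curve" cannot actually vary with training-set size (the unused accuracy_score and calculate_ar2 imports suggest earlier experiments). A sketch of the refitting variant, assuming a scikit-learn style estimator:

import numpy as np
from sklearn.base import clone
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

def my_learning_curve_refit(estimator, X, y):
    train_sizes = np.linspace(0.1, 1.0, 10)[:-1]
    train_scores, val_scores = [], []
    for train_size in train_sizes:
        X_tr, X_val, y_tr, y_val = train_test_split(
            X, y, train_size=train_size, random_state=42)
        # Refit a fresh clone per training size so the curve reflects
        # how performance scales with the amount of data.
        est = clone(estimator)
        est.fit(X_tr, y_tr)
        train_scores.append(r2_score(y_tr, est.predict(X_tr)))
        val_scores.append(r2_score(y_val, est.predict(X_val)))
    return train_sizes, train_scores, val_scores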
analysis/neural_model.py
DELETED
@@ -1,321 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn import preprocessing
from torch.utils.data import TensorDataset
from tqdm import tqdm
import json
import os
import warnings
from sklearn.neural_network import MLPRegressor

from coding.llh.analysis.shap_model import shap_calculate
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_line_graph import draw_line_graph
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
import numpy as np

from coding.llh.static.config import Config
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_learning_curve import draw_learning_curve
from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings("ignore")


def mlp_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "mlp regression model"

    model = MLPRegressor()
    params = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'max_iter': [100, 200, 300]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
        best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x[:500], y[:500], cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, model_name))
    # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
    # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)

    # shap_calculate(best_model, x_test, feature_names)

    return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std


def ann(df):
    # Parameter initialization
    lr = 0.0001
    batch_size = 32
    input_dim = 10
    output_dim = 4
    epochs = 40
    best_acc = 0
    save_path = "./model/model.pth"

    # Hardware setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device loaded for training: [{}]".format(device))

    # Dataset split
    def split_data(data: pd.DataFrame):
        data = np.array(data)

        dataX = data[:, 1:]
        dataY = data[:, :1]

        dataX = np.array(dataX)
        dataY = np.array(dataY)

        total_size = dataX.shape[0]
        train_size = int(np.round(0.8 * total_size))

        x_train = dataX[: train_size, :]
        y_train = dataY[: train_size]

        x_test = dataX[train_size:, :]
        y_test = dataY[train_size:]

        return x_train, y_train, x_test, y_test, total_size, train_size

    x_train, y_train, x_test, y_test, total_size, train_size = split_data(df)

    # Data preprocessing
    x_train = preprocessing.scale(x_train)
    x_test = preprocessing.scale(x_test)

    y_train = y_train - 1
    y_test = y_test - 1

    # Data format conversion
    x_train_tensor = torch.from_numpy(x_train).to(torch.float32)
    y_train_tensor = torch.from_numpy(y_train).to(torch.float32)
    x_test_tensor = torch.from_numpy(x_test).to(torch.float32)
    y_test_tensor = torch.from_numpy(y_test).to(torch.float32)

    train_data = TensorDataset(x_train_tensor, y_train_tensor)
    test_data = TensorDataset(x_test_tensor, y_test_tensor)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size, True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size, False)

    print("Data loaded for training: [{}]".format(len(train_data)))
    print("Data loaded for testing: [{}]".format(len(test_data)))

    # Model definition
    class ANN(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(ANN, self).__init__()

            self.hidden1 = nn.Sequential(
                nn.Linear(input_dim, 16, bias=True),
                nn.ReLU()
            )
            self.hidden2 = nn.Sequential(
                nn.Linear(16, 32, bias=True),
                nn.ReLU()
            )
            self.hidden3 = nn.Sequential(
                nn.Linear(32, 64, bias=True),
                nn.ReLU()
            )
            self.hidden4 = nn.Sequential(
                nn.Linear(64, 128, bias=True),
                nn.ReLU()
            )
            self.hidden5 = nn.Sequential(
                nn.Linear(128, 256, bias=True),
                nn.ReLU()
            )
            self.hidden6 = nn.Sequential(
                nn.Linear(256, 512, bias=True),
                nn.ReLU()
            )
            self.hidden7 = nn.Sequential(
                nn.Linear(512, 1024, bias=True),
                nn.ReLU()
            )
            self.hidden8 = nn.Sequential(
                nn.Linear(1024, output_dim, bias=True),
                nn.Softmax()
            )

        def forward(self, x):
            x = self.hidden1(x)
            x = self.hidden2(x)
            x = self.hidden3(x)
            x = self.hidden4(x)
            x = self.hidden5(x)
            x = self.hidden6(x)
            x = self.hidden7(x)
            x = self.hidden8(x)

            return x

    model = ANN(input_dim, output_dim).to(device)
    print("Model set: [{}]".format(model))

    # Loss function definition
    criterion = nn.CrossEntropyLoss()
    print("Criterion set: [{}]".format(type(criterion)))

    # Optimizer definition
    optimizer = torch.optim.Adam(model.parameters(), lr)
    print("Optimizer set: [{}]".format(type(optimizer)))
    print()

    if os.path.isfile(save_path):
        # Model loading
        state_dict = torch.load(save_path)
        model.load_state_dict(state_dict, strict=False)
        print("!Model loaded")

        with open("./model/best_acc.json", "r") as f:
            print("Best accuracy of current model: [{}]".format(json.load(f)))

    else:
        print("!Training starting\n")

        train_loss_list = []
        train_acc_list = []
        test_loss_list = []
        test_acc_list = []

        y_pred_list = []
        y_real_list = []

        for epoch in range(epochs):
            # Model training
            model.train()

            train_loss = 0
            train_acc = 0
            train_acc_count = 0
            train_count = 0
            train_bar = tqdm(train_loader)
            for data in train_bar:
                x_train, y_train = data
                x_train = x_train.to(device)
                y_train = y_train.to(device)
                # Reset optimizer gradients
                optimizer.zero_grad()
                # Forward pass
                output = model(x_train)
                # Compute the loss
                loss = criterion(output, y_train.reshape(-1).long())
                # Backward pass: compute gradients
                loss.backward()
                # Backward pass: update parameters
                optimizer.step()

                train_loss += loss.item()
                train_bar.desc = "Train epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
                train_acc_count += (output.argmax(axis=1) == y_train.view(-1).int()).sum().item()
                train_count += len(x_train)

            train_acc = train_acc_count / train_count

            # Model testing
            model.eval()

            test_loss = 0
            test_acc = 0
            test_acc_count = 0
            test_count = 0
            with torch.no_grad():
                test_bar = tqdm(test_loader)
                for data in test_bar:
                    x_test, y_test = data
                    x_test = x_test.to(device)
                    y_test = y_test.to(device)
                    # Forward pass
                    output = model(x_test)

                    y_pred_list.append(output.tolist())
                    y_real_list.append(y_test.tolist())

                    # Compute the loss
                    loss = criterion(output, y_test.reshape(-1).long())

                    test_loss += loss.item()
                    test_bar.desc = "Test epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
                    test_acc_count += (output.argmax(axis=1) == y_test.view(-1).int()).sum().item()
                    test_count += len(x_test)

                test_acc = test_acc_count / test_count

            print("\nEpoch: {}".format(epoch + 1))
            print("Train_loss: {:.4f}".format(train_loss))
            print("Train_accuracy: {:.4f}".format(train_acc))
            print("Test_loss: {:.4f}".format(test_loss))
            print("Test_accuracy: {:.4f}".format(test_acc))
            print("\n")

            train_loss_list.append(train_loss)
            train_acc_list.append(train_acc)
            test_loss_list.append(test_loss)
            test_acc_list.append(test_acc)

            # Save the current best model and best accuracy
            if test_acc > best_acc:
                best_acc = test_acc
                with open("./model/info.json", "w") as f:
                    json.dump({
                        "best_acc": [best_acc],
                        "train_loss_list": train_loss_list,
                        "train_acc_list": train_acc_list,
                        "test_loss_list": test_loss_list,
                        "test_acc_list": test_acc_list,
                        "y_pred_list": y_pred_list,
                        "y_real_list": y_real_list
                    }, f)

                torch.save(model.state_dict(), save_path)

        print("\n!Training finished")
        print("Best accuracy: {:.4f}".format(best_acc))

        # Data visualization
        draw_line_graph(
            range(len(y_pred_list)),
            [y_pred_list, y_real_list],
            "ANN prediction",
            ["predict, real"]
        )
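Note on the deleted ANN above: the last layer applies nn.Softmax() and the result is then fed to nn.CrossEntropyLoss, which applies log-softmax internally, so the loss is computed on doubly-normalized outputs and gradients are weakened. The conventional pattern is to emit raw logits; a minimal sketch with the same final width:

import torch.nn as nn

# Emit raw logits; CrossEntropyLoss applies log-softmax internally.
head = nn.Linear(1024, 4, bias=True)  # no Softmax here
criterion = nn.CrossEntropyLoss()
# Inference is unchanged: argmax over logits equals argmax over
# softmax probabilities.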
analysis/others/evaluation_model.py
DELETED
@@ -1,99 +0,0 @@
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt


def fuzzy_comprehensive_evaluation_model():
    # Create the fuzzy variables and fuzzy sets
    technical_skill = ctrl.Antecedent(np.arange(0, 101, 1), 'technical_skill')
    physical_condition = ctrl.Antecedent(np.arange(0, 101, 1), 'physical_condition')
    mental_toughness = ctrl.Antecedent(np.arange(0, 101, 1), 'mental_toughness')
    opponent_strength = ctrl.Antecedent(np.arange(0, 101, 1), 'opponent_strength')

    performance = ctrl.Consequent(np.arange(0, 101, 1), 'performance')

    # Define the fuzzy membership functions
    technical_skill['low'] = fuzz.trimf(technical_skill.universe, [0, 0, 50])
    technical_skill['medium'] = fuzz.trimf(technical_skill.universe, [0, 50, 100])
    technical_skill['high'] = fuzz.trimf(technical_skill.universe, [50, 100, 100])

    physical_condition['low'] = fuzz.trimf(physical_condition.universe, [0, 0, 50])
    physical_condition['medium'] = fuzz.trimf(physical_condition.universe, [0, 50, 100])
    physical_condition['high'] = fuzz.trimf(physical_condition.universe, [50, 100, 100])

    mental_toughness['low'] = fuzz.trimf(mental_toughness.universe, [0, 0, 50])
    mental_toughness['medium'] = fuzz.trimf(mental_toughness.universe, [0, 50, 100])
    mental_toughness['high'] = fuzz.trimf(mental_toughness.universe, [50, 100, 100])

    opponent_strength['low'] = fuzz.trimf(opponent_strength.universe, [0, 0, 50])
    opponent_strength['medium'] = fuzz.trimf(opponent_strength.universe, [0, 50, 100])
    opponent_strength['high'] = fuzz.trimf(opponent_strength.universe, [50, 100, 100])

    performance['poor'] = fuzz.trimf(performance.universe, [0, 0, 50])
    performance['average'] = fuzz.trimf(performance.universe, [0, 50, 100])
    performance['excellent'] = fuzz.trimf(performance.universe, [50, 100, 100])

    # Set the defuzzification method for the output: centroid
    performance.defuzzify_method = 'centroid'

    # Define the rules
    rule1 = ctrl.Rule(
        technical_skill['low'] | physical_condition['low'] | mental_toughness['low'] | opponent_strength['low'],
        performance['poor']
    )
    rule2 = ctrl.Rule(
        technical_skill['medium'] | physical_condition['medium'] | mental_toughness['medium'] | opponent_strength['medium'],
        performance['average']
    )
    rule3 = ctrl.Rule(
        technical_skill['high'] | physical_condition['high'] | mental_toughness['high'] | opponent_strength['high'],
        performance['excellent']
    )

    # Create the control system
    performance_evaluation = ctrl.ControlSystem([rule1, rule2, rule3])
    performance_evaluator = ctrl.ControlSystemSimulation(performance_evaluation)

    # Input data
    performance_evaluator.input['technical_skill'] = 75
    performance_evaluator.input['physical_condition'] = 80
    performance_evaluator.input['mental_toughness'] = 85
    performance_evaluator.input['opponent_strength'] = 60

    # Compute the fuzzy comprehensive score
    performance_evaluator.compute()

    # Output the result
    print("模糊综合评分:", performance_evaluator.output['performance'])

    # Plot the fuzzy sets
    technical_skill.view("technical_skill", sim=performance_evaluator)
    physical_condition.view("physical_condition", sim=performance_evaluator)
    mental_toughness.view("mental_toughness", sim=performance_evaluator)
    opponent_strength.view("opponent_strength", sim=performance_evaluator)
    performance.view("performance", sim=performance_evaluator)

    # Perform sensitivity analysis (by varying the input values)

    # input_var_1:

    # input_values = np.arange(0, 11, 1)
    # output_values = []
    #
    # for val in input_values:
    #     fuzzy_control_sys_simulation.input["input_var_1"] = val
    #     fuzzy_control_sys_simulation.compute()
    #     output_values.append(fuzzy_control_sys_simulation.output["output_var"])
    #
    # plt.plot(
    #     input_values,
    #     output_values,
    #     label="Sensitivity Analysis"
    # )
    # plt.xlabel("Input Variable 1")
    # plt.ylabel("Output Variable")
    # plt.legend()
    # plt.show()
    #
    # return fuzzy_control_sys_simulation.output["output_var"]
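Note on the rules above: all three use `|` (OR), so `performance['excellent']` fires as soon as any single input is high, which likely overstates performance. If the intent was "all dimensions strong", `&` is the usual connective; a sketch reusing the antecedents defined in the deleted function (an assumption about the intended semantics, not the committed behavior):

from skfuzzy import control as ctrl

# AND (&) requires every dimension to be high; OR (|) fires on any one.
rule3_strict = ctrl.Rule(
    technical_skill['high'] & physical_condition['high'] &
    mental_toughness['high'] & opponent_strength['high'],
    performance['excellent']
)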
analysis/others/gaussian_model.py
DELETED
@@ -1,28 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture


def gaussian_mix(x):
    x = x.reshape(-1, 1)
    n_components = 2000  # Adjust the number of mixture components as needed
    gmm = GaussianMixture(n_components=n_components, covariance_type='full')

    # Fit the model
    gmm.fit(x)

    # Draw new samples from the fitted mixture
    continuous_data = gmm.sample(len(x))[0].reshape(-1)

    return continuous_data

# Fit the data with a Gaussian mixture model
# gmm = GaussianMixture(n_components=50)  # choose the number of mixture components
# gmm.fit(x.reshape(-1, 1))

# Generate continuous data
# return np.linspace(min(x), max(x), len(x)).flatten()

# z = np.exp(gmm.score_samples(y.reshape(-1, 1)))

# return z
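Note on the deleted file above: n_components=2000 will exceed the sample count for most inputs, and GaussianMixture.fit raises when n_components > n_samples. A sketch of choosing a small component count by BIC instead (illustrative, not the committed behavior):

import numpy as np
from sklearn.mixture import GaussianMixture

def fit_gmm_by_bic(x: np.ndarray, max_components: int = 10) -> GaussianMixture:
    x = x.reshape(-1, 1)
    candidates = [
        GaussianMixture(n_components=k, covariance_type='full', random_state=0).fit(x)
        for k in range(1, max_components + 1)
    ]
    # Lower BIC = better fit/complexity trade-off.
    return min(candidates, key=lambda gmm: gmm.bic(x))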
analysis/others/markov_model.py
DELETED
@@ -1,98 +0,0 @@
import numpy as np
import pandas as pd
from hmmlearn import hmm


def train_and_predict_hidden_markov_model(df):
    window_size = 10

    # train_df = df[['point_won', 'point_loss', 'ace', 'winner', 'double_fault', 'unf_err', 'net_point', 'net_point_won', 'break_pt', 'break_pt_won', 'break_pt_miss']]

    train_df = df
    # "p1_winner",
    # "p2_winner",
    # "winner_shot_type",
    # "p1_double_fault",
    # "p2_double_fault",
    # "p1_unf_err",
    # "p2_unf_err",
    # "p1_net_pt_won",
    # "p2_net_pt_won",
    # "p1_break_pt_won",
    # "p2_break_pt_won",
    # "rally_count",
    # "serve_width",
    # "serve_depth",
    # "return_depth"
    df["observation"] = 0

    # mapping = {}
    # counter = 0
    # for i in range(len(train_df)):
    #     cur_combination = train_df.iloc[i].to_list()
    #
    #     if str(cur_combination) not in mapping.keys():
    #         mapping[str(cur_combination)] = counter
    #         df.loc[i, "observation"] = counter
    #         counter += 1
    #     else:
    #         df.loc[i, "observation"] = mapping[str(cur_combination)]

    observation_list = df["observation"].to_list()

    # value_separated_observation_list = [observation_list[i - window_size: i] for i in range(window_size, len(observation_list))]
    # value_separated_observation_list = [[0] * window_size] * window_size + value_separated_observation_list

    observations = np.array([np.sum(np.array([train_df.iloc[j].to_list() for j in range(i-window_size, i)]).astype(int), axis=0) for i in range(window_size, len(train_df))])

    observations = abs(np.min(observations)) + observations

    observations = observations.astype(int)

    m_observations = np.concatenate(
        (np.array([observations[0].tolist()] * window_size), observations),
        axis=0
    )

    df = pd.concat([df, pd.DataFrame({"window_observation": m_observations.tolist()})], axis=1)

    hidden_markov_model = hmm.MultinomialHMM(n_components=5, n_iter=50, tol=0.01)

    hidden_markov_model.fit(observations)

    start_prob = hidden_markov_model.startprob_
    transition_prob = hidden_markov_model.transmat_
    emission_prob = hidden_markov_model.emissionprob_

    neg_log_likelihood, pred = calculate_momentum(df, hidden_markov_model, m_observations)

    _, hidden2observation = hidden_markov_model.score_samples(observations)

    state_impacts = np.sum(hidden2observation, axis=0)

    return state_impacts, neg_log_likelihood, pred, start_prob, transition_prob, emission_prob

    state_impacts = np.zeros((num_states, num_obs))

    for t in range(num_obs):
        for i in range(num_states):
            state_impacts[i, t] = (forward_prob[t, i] * backward_prob[t, i]) / np.sum(
                forward_prob[t, :] * backward_prob[t, :])

    return neg_log_likelihood, pred, start_prob, transition_prob, emission_prob


def calculate_momentum(df, hidden_markov_model, m_observations):
    # pred_list = []
    # neg_log_likelihood_list = []
    # for i in range(len(df)):
    #     neg_log_likelihood, pred = hidden_markov_model.decode(np.array([df.loc[i, "window_observation"]]))
    #     pred_list.append(pred[0])
    #     neg_log_likelihood_list.append(neg_log_likelihood)
    #
    # return pred_list, neg_log_likelihood_list

    neg_log_likelihood, pred = hidden_markov_model.decode(m_observations)

    return neg_log_likelihood, pred
analysis/others/poly_model.py
DELETED
@@ -1,12 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt


def poly_fit(x_values, y_values, degree=60):
    # Fit a polynomial with numpy's polyfit
    coefficients = np.polyfit(x_values, y_values, degree)

    # Build the fitted polynomial function
    fitted_curve = np.poly1d(coefficients)

    return fitted_curve(x_values)
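Note on the deleted helper above (identical in analysis/poly_model.py below): degree-60 least-squares fits on raw x values are numerically ill-conditioned, and np.polyfit itself warns about this. The numpy.polynomial API fits in a shifted/scaled domain, which is the usual remedy; a sketch:

import numpy as np
from numpy.polynomial import Polynomial

def poly_fit_stable(x_values, y_values, degree=15):
    # Polynomial.fit maps x into [-1, 1] internally, which keeps
    # high-degree least-squares fits better conditioned.
    p = Polynomial.fit(x_values, y_values, degree)
    return p(np.asarray(x_values))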
analysis/poly_model.py
DELETED
@@ -1,12 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt


def poly_fit(x_values, y_values, degree=60):
    # Fit a polynomial with numpy's polyfit
    coefficients = np.polyfit(x_values, y_values, degree)

    # Build the fitted polynomial function
    fitted_curve = np.poly1d(coefficients)

    return fitted_curve(x_values)
analysis/shap_model.py
DELETED
@@ -1,55 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np
import shap


def draw_shap_beeswarm(model, x, feature_names, type, paint_object):
    explainer = shap.KernelExplainer(model.predict, x)
    shap_values = explainer(x)

    shap.summary_plot(shap_values, x, feature_names=feature_names, plot_type=type, show=False)

    plt.title(paint_object.get_name())
    plt.tight_layout()

    return plt, paint_object


def draw_waterfall(model, x, feature_names, number, paint_object):
    explainer = shap.KernelExplainer(model.predict, x, feature_names=feature_names)
    shap_values = explainer(x)

    shap.waterfall_plot(shap_values[number], show=False)

    plt.title(paint_object.get_name())
    plt.tight_layout()

    return plt, paint_object


def draw_force(model, x, feature_names, number, paint_object):
    explainer = shap.KernelExplainer(model.predict, x, feature_names=feature_names)
    shap_values = explainer(x[number])

    shap.force_plot(explainer.expected_value, shap_values.values, feature_names=feature_names, show=False, matplotlib=True)

    plt.title(paint_object.get_name())
    plt.tight_layout()

    return plt, paint_object


def draw_dependence(model, x, feature_names, col, paint_object):
    explainer = shap.KernelExplainer(model.predict, x, feature_names=feature_names)
    shap_values = explainer(x)

    shap.dependence_plot(feature_names.index(col), shap_values.values, x, feature_names=feature_names, show=False)

    plt.title(paint_object.get_name())
    plt.tight_layout()

    return plt, paint_object
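Note on the deleted helpers above: each one passes the full dataset x as the KernelExplainer background, so SHAP estimation cost grows with len(x) for every explained row. A common mitigation, sketched under the assumption that summarizing the background with shap's sampling helper is acceptable here:

import shap

# Summarize the background with ~100 rows; keeps KernelExplainer
# tractable on larger tables at some cost in fidelity.
background = shap.sample(x, 100)
explainer = shap.KernelExplainer(model.predict, background)
shap_values = explainer(x)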
analysis/tree_model.py
DELETED
@@ -1,290 +0,0 @@
from metrics.calculate_regression_metrics import calculate_regression_metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import lightgbm as lightGBMClassifier

from analysis.shap_model import *
from metrics.calculate_classification_metrics import calculate_classification_metrics
from static.config import Config
from static.process import grid_search, bayes_search
from static.new_class import *


class RandomForestRegressionParams:
    @classmethod
    def get_params(cls):
        return {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }


# Random forest regression
def random_forest_regression(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    random_forest_regression_model = RandomForestRegressor(n_estimators=5, random_state=Config.RANDOM_STATE)
    params = RandomForestRegressionParams.get_params()

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, random_forest_regression_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, random_forest_regression_model, x_train, y_train)
    else:
        best_model = random_forest_regression_model
        best_model.fit(x_train, y_train)

    info["参数"] = best_model.get_params()

    y_pred = best_model.predict(x_test)
    # y_pred = best_model.predict(x_test).reshape(-1, 1)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
                                        test_scores_std)

    info["指标"] = calculate_regression_metrics(y_pred, y_test)

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


class DecisionTreeClassifierParams:
    @classmethod
    def get_params(cls):
        return {
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"],
            "max_depth": [None, 5, 10, 15],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }


# Decision tree classification
def decision_tree_classifier(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    random_forest_regression_model = DecisionTreeClassifier(random_state=Config.RANDOM_STATE)
    params = DecisionTreeClassifierParams.get_params()

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, random_forest_regression_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, random_forest_regression_model, x_train, y_train)
    else:
        best_model = random_forest_regression_model
        best_model.fit(x_train, y_train)

    info["参数"] = best_model.get_params()

    y_pred = best_model.predict(x_test)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
                                        test_scores_std)

    info["指标"] = calculate_classification_metrics(y_pred, y_test)

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


class RandomForestClassifierParams:
    @classmethod
    def get_params(cls):
        return {
            "criterion": ["gini", "entropy"],
            "n_estimators": [50, 100, 150],
            "max_depth": [None, 5, 10, 15],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }


# Random forest classification
def random_forest_classifier(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    random_forest_classifier_model = RandomForestClassifier(n_estimators=5, random_state=Config.RANDOM_STATE)
    params = RandomForestClassifierParams.get_params()

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, random_forest_classifier_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, random_forest_classifier_model, x_train, y_train)
    else:
        best_model = random_forest_classifier_model
        best_model.fit(x_train, y_train)

    info["参数"] = best_model.get_params()

    y_pred = best_model.predict(x_test)
    # y_pred = best_model.predict(x_test).reshape(-1, 1)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
                                        test_scores_std)

    info["指标"] = calculate_classification_metrics(y_pred, y_test)

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


class XgboostClassifierParams:
    @classmethod
    def get_params(cls):
        return {
            "n_estimators": [50, 100, 150],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 4, 5],
            "min_child_weight": [1, 2, 3],
            "gamma": [0, 0.1, 0.2],
            "subsample": [0.5, 0.8, 0.9, 1.0],
            "colsample_bytree": [0.8, 0.9, 1.0]
        }


# XGBoost classification
def xgboost_classifier(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    xgboost_classifier_model = XGBClassifier(random_state=Config.RANDOM_STATE)
    params = XgboostClassifierParams.get_params()

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, xgboost_classifier_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, xgboost_classifier_model, x_train, y_train)
    else:
        best_model = xgboost_classifier_model
        best_model.fit(x_train, y_train)

    info["参数"] = best_model.get_params()

    y_pred = best_model.predict(x_test)
    # y_pred = best_model.predict(x_test).reshape(-1, 1)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
                                        test_scores_std)

    info["指标"] = calculate_classification_metrics(y_pred, y_test)

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


class LightGBMClassifierParams:
    @classmethod
    def get_params(cls):
        return


# LightGBM classification
def lightGBM_classifier(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    lightgbm_classifier_model = lightGBMClassifier
    params = LightGBMClassifierParams.get_params()

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, lightgbm_classifier_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, lightgbm_classifier_model, x_train, y_train)
    else:
        best_model = lightgbm_classifier_model
        best_model.train(x_train, y_train)

    info["参数"] = best_model.get_params()

    y_pred = best_model.predict(x_test)
    # y_pred = best_model.predict(x_test).reshape(-1, 1)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean,
                                        test_scores_std)

    info["指标"] = calculate_classification_metrics(y_pred, y_test)

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container
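Note on lightGBM_classifier above: the file imports the lightgbm *module* under the name lightGBMClassifier and then assigns that module, not an estimator instance, while LightGBMClassifierParams.get_params returns None, so every optimization path would fail. The scikit-learn compatible estimator is lightgbm.LGBMClassifier; a sketch of what the binding presumably intended (param grid values are illustrative, reusing Config from static/config.py below):

from lightgbm import LGBMClassifier

# Bind an estimator instance rather than the lightgbm module.
lightgbm_classifier_model = LGBMClassifier(random_state=Config.RANDOM_STATE)
params = {
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, -1],  # -1 means no depth limit in LightGBM
}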
analysis/two_exponential_smoothing_model.py
DELETED
@@ -1,48 +0,0 @@
import matplotlib.pyplot as plt


# Double exponential smoothing
def double_exponential_smoothing(series, alpha, beta):
    """
    series - dataset with timeseries
    alpha - float [0.0, 1.0], smoothing parameter for level
    beta - float [0.0, 1.0], smoothing parameter for trend
    """
    # first value is same as series
    result = [series[0]]
    for n in range(1, len(series) + 1):
        if n == 1:
            level, trend = series[0], series[1] - series[0]
        if n >= len(series):  # forecasting
            value = result[-1]
        else:
            value = series[n]
        last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
        trend = beta * (level - last_level) + (1 - beta) * trend
        result.append(level + trend)
    return result


def plotDoubleExponentialSmoothing(series, alphas, betas):
    """
    Plots double exponential smoothing with different alphas and betas

    series - dataset with timestamps
    alphas - list of floats, smoothing parameters for level
    betas - list of floats, smoothing parameters for trend
    """

    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(13, 5))
        for alpha in alphas:
            for beta in betas:
                plt.plot(double_exponential_smoothing(series, alpha, beta),
                         label="Alpha {}, beta {}".format(alpha, beta))
        plt.plot(series.values, label="Actual")
        plt.legend(loc="best")
        plt.axis('tight')
        plt.title("Double Exponential Smoothing")
        plt.grid(True)


plotDoubleExponentialSmoothing(data['trend'], alphas=[0.5, 0.3], betas=[0.9, 0.3])
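For reference, the recurrences the deleted function implements are Holt's linear (double exponential smoothing) method, with level ℓ and trend b:

\ell_t = \alpha y_t + (1 - \alpha)(\ell_{t-1} + b_{t-1})
b_t = \beta(\ell_t - \ell_{t-1}) + (1 - \beta) b_{t-1}
\hat{y}_{t+1} = \ell_t + b_t

Past the end of the series the last one-step forecast is fed back in as the next value. Also note the module-level call at the bottom references a name `data` that this file never defines, so importing the module as-is would raise NameError.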
functions/process.py
CHANGED
@@ -3,6 +3,9 @@ def get_values_from_container_class(container):
 
 
 def transform_params_list(params_class, params_list, model=None):
+    # test
+    print("params_class: {}, params_list: {}".format(str(params_class), str(params_list)))
+
     input_params_keys = []
     input_params_values = []
     inner_value_list = []
@@ -18,21 +21,24 @@ def transform_params_list(params_class, params_list, model=None):
             inner_value_list.append(param)
         else:
             input_params_values.append(inner_value_list)
-
+    params = dict(zip(input_params_keys, input_params_values))
 
-    for k, v in
+    for k, v in params.items():
         if k in keys:
             value_type = params_class.get_params_type(model)[k] if model else params_class.get_params_type()[k]
             try:
                 if value_type == "int":
-
+                    params[k] = [int(x) for x in params[k]]
                 elif value_type == "float":
-
+                    params[k] = [float(x) for x in params[k]]
                 elif value_type == "bool":
-
+                    params[k] = [x == "True" for x in params[k]]
                 elif value_type == "str":
-
+                    params[k] = [str(x) for x in params[k]]
            except Exception:
-
+                params[k] = [str(x) for x in params[k]]
+
+    # test
+    print("params: {}".format(str(params)))
 
-    return
+    return params
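With this change transform_params_list zips the collected keys and value lists into a dict and coerces each value list to its declared type before returning it. A hypothetical round-trip, assuming a params class whose get_params_type() declares these types (names and the input shape are inferred from the diff, not confirmed by it):

# Hypothetical input as collected from UI widgets (all strings):
#   keys:   ["max_depth", "min_samples_leaf"]
#   values: [["5", "10"], ["1", "2"]]
# With get_params_type() mapping both keys to "int", the function
# would now return:
#   {"max_depth": [5, 10], "min_samples_leaf": [1, 2]}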
static/__init__.py
DELETED
File without changes
static/col.py
DELETED
@@ -1,68 +0,0 @@
def get_pca_col():
    return [
        "p1_momentum_value_better",
        "elapsed_time",
        "server",
        "serve_no",
        "p1_ace",
        "p2_ace",
        "p1_winner",
        "p2_winner",
        "winner_shot_type",
        # "p1_double_fault",
        "p2_double_fault",
        "p1_unf_err",
        "p2_unf_err",
        "p1_net_pt",
        "p2_net_pt",
        "p1_net_pt_won",
        "p2_net_pt_won",
        "p1_break_pt",
        "p2_break_pt",
        "p1_break_pt_won",
        "p2_break_pt_won",
        "p1_break_pt_missed",
        "p2_break_pt_missed",
        "p1_distance_run",
        "p2_distance_run",
        "rally_count",
        "speed_mph",
        "serve_width",
        "serve_depth",
        "return_depth"
    ]


def get_momentum_col(p):
    return [
        "point_victor",
        "elapsed_time",
        "server",
        "serve_no",
        "{}_ace".format(p),
        # "p2_ace",
        "{}_winner".format(p),
        # "p2_winner",
        "winner_shot_type",
        # "p1_double_fault",
        # "p2_double_fault",
        "{}_unf_err".format(p),
        # "p2_unf_err",
        "{}_net_pt".format(p),
        # "p2_net_pt",
        "{}_net_pt_won".format(p),
        # "p2_net_pt_won",
        "{}_break_pt".format(p),
        # "p2_break_pt",
        "{}_break_pt_won".format(p),
        # "p2_break_pt_won",
        "{}_break_pt_missed".format(p),
        # "p2_break_pt_missed",
        "{}_distance_run".format(p),
        # "p2_distance_run",
        "rally_count",
        "speed_mph",
        "serve_width",
        "serve_depth",
        "return_depth"
    ]
static/config.py
DELETED
@@ -1,136 +0,0 @@
class Config:
    # Random seed
    RANDOM_STATE = 123

    # Number of points shown in prediction plots
    DISPLAY_RANGE = 100

    # Plot color palettes
    COLOR_ITER_NUM = 3

    COLORS = [
        "#ca5353",
        "#c874a5",
        "#b674c8",
        "#8274c8",
        "#748dc8",
        "#74acc8",
        "#74c8b7",
        "#74c88d",
        "#a6c874",
        "#e0e27e",
        "#df9b77",
        "#404040",
        "#999999",
        "#d4d4d4"
    ] * COLOR_ITER_NUM

    COLORS_0 = [
        "#8074C8",
        "#7895C1",
        "#A8CBDF",
        "#992224",
        "#B54764",
        "#E3625D",
        "#EF8B67",
        "#F0C284"
    ] * COLOR_ITER_NUM

    COLORS_1 = [
        "#4A5F7E",
        "#719AAC",
        "#72B063",
        "#94C6CD",
        "#B8DBB3",
        "#E29135"
    ] * COLOR_ITER_NUM

    COLORS_2 = [
        "#4485C7",
        "#D4562E",
        "#DBB428",
        "#682487",
        "#84BA42",
        "#7ABBDB",
        "#A51C36"
    ] * COLOR_ITER_NUM

    COLORS_3 = [
        "#8074C8",
        "#7895C1",
        "#A8CBDF",
        "#F5EBAE",
        "#F0C284",
        "#EF8B67",
        "#E3625D",
        "#B54764"
    ] * COLOR_ITER_NUM

    COLORS_4 = [
        "#979998",
        "#C69287",
        "#E79A90",
        "#EFBC91",
        "#E4CD87",
        "#FAE5BB",
        "#DDDDDF"
    ] * COLOR_ITER_NUM

    COLORS_5 = [
        "#91CCC0",
        "#7FABD1",
        "#F7AC53",
        "#EC6E66",
        "#B5CE4E",
        "#BD7795",
        "#7C7979"
    ] * COLOR_ITER_NUM

    COLORS_6 = [
        "#E9687A",
        "#F58F7A",
        "#FDE2D8",
        "#CFCFD0",
        "#B6B3D6"
    ] * COLOR_ITER_NUM

    JS_0 = """
        function createGradioAnimation() {
            var container = document.createElement('div');
            container.id = 'gradio-animation';
            container.style.fontSize = '2em';
            container.style.fontWeight = 'bold';
            container.style.textAlign = 'center';
            container.style.marginBottom = '20px';

            var text = 'Welcome to EasyMachineLearning!';
            for (var i = 0; i < text.length; i++) {
                (function(i){
                    setTimeout(function(){
                        var letter = document.createElement('span');
                        letter.style.opacity = '0';
                        letter.style.transition = 'opacity 0.5s';
                        letter.innerText = text[i];

                        container.appendChild(letter);

                        setTimeout(function() {
                            letter.style.opacity = '1';
                        }, 50);
                    }, i * 250);
                })(i);
            }

            var gradioContainer = document.querySelector('.gradio-container');
            gradioContainer.insertBefore(container, gradioContainer.firstChild);

            return 'Animation created';
        }
    """
static/new_class.py
DELETED
@@ -1,195 +0,0 @@
class Container:
    def __init__(self, x_train=None, y_train=None, x_test=None, y_test=None, hyper_params_optimize=None):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.hyper_params_optimize = hyper_params_optimize
        self.info = {"参数": {}, "指标": {}}
        self.y_pred = None
        self.train_sizes = None
        self.train_scores_mean = None
        self.train_scores_std = None
        self.test_scores_mean = None
        self.test_scores_std = None
        self.status = None
        self.model = None

    def get_info(self):
        return self.info

    def set_info(self, info: dict):
        self.info = info

    def set_y_pred(self, y_pred):
        self.y_pred = y_pred

    def get_data_fit_values(self):
        return [
            self.y_pred,
            self.y_test
        ]

    def get_learning_curve_values(self):
        return [
            self.train_sizes,
            self.train_scores_mean,
            self.train_scores_std,
            self.test_scores_mean,
            self.test_scores_std
        ]

    def set_learning_curve_values(self, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std):
        self.train_sizes = train_sizes
        self.train_scores_mean = train_scores_mean
        self.train_scores_std = train_scores_std
        self.test_scores_mean = test_scores_mean
        self.test_scores_std = test_scores_std

    def get_status(self):
        return self.status

    def set_status(self, status: str):
        self.status = status

    def get_model(self):
        return self.model

    def set_model(self, model):
        self.model = model


class PaintObject:
    def __init__(self):
        self.color_cur_num = 0
        self.color_cur_list = []
        self.label_cur_num = 0
        self.label_cur_list = []
        self.x_cur_label = ""
        self.y_cur_label = ""
        self.name = ""

    def get_color_cur_num(self):
        return self.color_cur_num

    def set_color_cur_num(self, color_cur_num):
        self.color_cur_num = color_cur_num

    def get_color_cur_list(self):
        return self.color_cur_list

    def set_color_cur_list(self, color_cur_list):
        self.color_cur_list = color_cur_list

    def get_label_cur_num(self):
        return self.label_cur_num

    def set_label_cur_num(self, label_cur_num):
        self.label_cur_num = label_cur_num

    def get_label_cur_list(self):
        return self.label_cur_list

    def set_label_cur_list(self, label_cur_list):
        self.label_cur_list = label_cur_list

    def get_x_cur_label(self):
        return self.x_cur_label

    def set_x_cur_label(self, x_cur_label):
        self.x_cur_label = x_cur_label

    def get_y_cur_label(self):
        return self.y_cur_label

    def set_y_cur_label(self, y_cur_label):
        self.y_cur_label = y_cur_label

    def get_name(self):
        return self.name

    def set_name(self, name):
        self.name = name


class SelectModel:
    def __init__(self):
        self.models = None
        self.waterfall_number = None
        self.force_number = None
        self.beeswarm_plot_type = None
        self.dependence_col = None
        self.data_distribution_col = None
        self.data_distribution_is_rotate = None
        self.descriptive_indicators_col = None
        self.descriptive_indicators_is_rotate = None
        self.heatmap_col = None
        self.heatmap_is_rotate = None

    def get_heatmap_col(self):
        return self.heatmap_col

    def set_heatmap_col(self, heatmap_col):
        self.heatmap_col = heatmap_col

    def get_heatmap_is_rotate(self):
        return self.heatmap_is_rotate

    def set_heatmap_is_rotate(self, heatmap_is_rotate):
        self.heatmap_is_rotate = heatmap_is_rotate

    def get_models(self):
        return self.models

    def set_models(self, models):
        self.models = models

    def get_waterfall_number(self):
        return self.waterfall_number

    def set_waterfall_number(self, waterfall_number):
        self.waterfall_number = waterfall_number

    def get_force_number(self):
        return self.force_number

    def set_force_number(self, force_number):
        self.force_number = force_number

    def get_beeswarm_plot_type(self):
        return self.beeswarm_plot_type

    def set_beeswarm_plot_type(self, beeswarm_plot_type):
        self.beeswarm_plot_type = beeswarm_plot_type

    def get_dependence_col(self):
        return self.dependence_col

    def set_dependence_col(self, dependence_col):
        self.dependence_col = dependence_col

    def get_data_distribution_col(self):
        return self.data_distribution_col

    def set_data_distribution_col(self, data_distribution_col):
        self.data_distribution_col = data_distribution_col

    def get_data_distribution_is_rotate(self):
        return self.data_distribution_is_rotate

    def set_data_distribution_is_rotate(self, data_distribution_is_rotate):
        self.data_distribution_is_rotate = data_distribution_is_rotate

    def get_descriptive_indicators_is_rotate(self):
        return self.descriptive_indicators_is_rotate

    def set_descriptive_indicators_is_rotate(self, descriptive_indicators_is_rotate):
        self.descriptive_indicators_is_rotate = descriptive_indicators_is_rotate

    def get_descriptive_indicators_col(self):
        return self.descriptive_indicators_col

    def set_descriptive_indicators_col(self, descriptive_indicators_col):
        self.descriptive_indicators_col = descriptive_indicators_col
|
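PaintObject and SelectModel above are plain state holders exposed through Java-style getters and setters. A minimal sketch of the same pattern as a Python dataclass (an illustration, not repo code; the class name PaintState is hypothetical), which generates the constructor automatically and lets callers use plain attribute access:

# Illustration only -- not part of the original repo.
from dataclasses import dataclass, field

@dataclass
class PaintState:
    color_cur_num: int = 0
    color_cur_list: list = field(default_factory=list)
    label_cur_num: int = 0
    label_cur_list: list = field(default_factory=list)
    x_cur_label: str = ""
    y_cur_label: str = ""
    name: str = ""

state = PaintState()
state.name = "learning curve"  # plain attribute access replaces set_name(...)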
static/paint.py
DELETED
@@ -1,51 +0,0 @@
class PaintObject:
    def __init__(self):
        self.color_cur_num = 0
        self.color_cur_list = []
        self.label_cur_num = 0
        self.label_cur_list = []
        self.x_cur_label = ""
        self.y_cur_label = ""
        self.name = ""

    def get_color_cur_num(self):
        return self.color_cur_num

    def set_color_cur_num(self, color_cur_num):
        self.color_cur_num = color_cur_num

    def get_color_cur_list(self):
        return self.color_cur_list

    def set_color_cur_list(self, color_cur_list):
        self.color_cur_list = color_cur_list

    def get_label_cur_num(self):
        return self.label_cur_num

    def set_label_cur_num(self, label_cur_num):
        self.label_cur_num = label_cur_num

    def get_label_cur_list(self):
        return self.label_cur_list

    def set_label_cur_list(self, label_cur_list):
        self.label_cur_list = label_cur_list

    def get_x_cur_label(self):
        return self.x_cur_label

    def set_x_cur_label(self, x_cur_label):
        self.x_cur_label = x_cur_label

    def get_y_cur_label(self):
        return self.y_cur_label

    def set_y_cur_label(self, y_cur_label):
        self.y_cur_label = y_cur_label

    def get_name(self):
        return self.name

    def set_name(self, name):
        self.name = name
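This file duplicates the PaintObject class kept in static/new_class.py above, which is presumably why the commit removes both copies. A hypothetical sketch of how such a paint-state object drives a plot (the function and matplotlib calls are my illustration, not the repo's drawing code under visualization/):

# Hypothetical usage -- illustrates the intent of the accumulated state
# (curve colors, curve labels, axis labels, title); not repo code.
import matplotlib.pyplot as plt

def draw_with_paint_object(x, curves, paint_object):
    for y, color, label in zip(curves,
                               paint_object.get_color_cur_list(),
                               paint_object.get_label_cur_list()):
        plt.plot(x, y, color=color, label=label)
    plt.xlabel(paint_object.get_x_cur_label())
    plt.ylabel(paint_object.get_y_cur_label())
    plt.title(paint_object.get_name())
    plt.legend()
    plt.show()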
static/process.py
DELETED
@@ -1,326 +0,0 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
import copy
import pandas as pd
from scipy.stats import spearmanr
from io import StringIO
from contextlib import redirect_stdout

from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_diabetes
from scipy.linalg import eig

from static.config import Config


def match_split(df: pd.DataFrame):
    return df.groupby("match_id")


# Spearman rank correlation coefficient
def calculate_spearmanr(x, y):
    rho, p_value = spearmanr(x, y)

    return rho, p_value

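# A quick self-check sketch (illustration, not original repo code):
# rho lies in [-1, 1] and p_value tests the null hypothesis of no
# monotonic association; a monotonic but non-linear relationship
# still yields rho == 1.
def _demo_calculate_spearmanr():
    x = np.arange(1, 21)
    y = x.astype(float) ** 3
    rho, p_value = calculate_spearmanr(x, y)
    print(rho, p_value)  # 1.0 0.0
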
def calculate_remain_positive_points(df: pd.DataFrame):
    # If remain_positive is infinitely far away, set it to len(df)

    df["p1_remain_positive"] = 0
    df["p2_remain_positive"] = 0
    p1_zero_distance_list = []
    p2_zero_distance_list = []

    for i in range(1, len(df)):
        if (df.loc[i, "p1_momentum_value_better"] > 0
                and i != 0):
            p1_zero_distance_list.append(i)
        elif (df.loc[i, "p1_momentum_value_better"] < 0
                and i != 0):
            p2_zero_distance_list.append(i)

    for j in range(len(df)):
        for x in p1_zero_distance_list:
            if j <= x:
                df.loc[j, "p1_remain_positive"] = x - j
                break
            else:
                continue

    for j in range(len(df)):
        for x in p2_zero_distance_list:
            if j <= x:
                df.loc[j, "p2_remain_positive"] = x - j
                break
            else:
                continue

    return df

def calculate_swing_point(df: pd.DataFrame):
    # If the swing is infinitely far away, set it to len(df)

    df["swing"] = 0
    zero_distance_list = []

    for i in range(1, len(df)):
        if (df.loc[i, "p1_momentum_value_better"] > 0 and df.loc[i - 1, "p1_momentum_value_better"] < 0
                and i != 0) or (df.loc[i, "p1_momentum_value_better"] < 0 and df.loc[i - 1, "p1_momentum_value_better"] > 0
                and i != 0):
            zero_distance_list.append(i)

    for j in range(len(df)):
        for x in zero_distance_list:
            if j <= x:
                df.loc[j, "swing"] = x - j
                break
            else:
                continue

    return df

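# Sketch of the shared idea (illustration, not original repo code): both
# functions above compute, for each row j, the distance to the next
# recorded index x >= j (sign flips for "swing", momentum sign for
# "remain_positive"), defaulting to 0 when none remains.
def _demo_swing_distance():
    momentum = [0.5, 0.2, -0.1, -0.4, 0.3, 0.1]
    flips = [i for i in range(1, len(momentum))
             if momentum[i] * momentum[i - 1] < 0]   # [2, 4]
    swing = [next((x - j for x in flips if j <= x), 0)
             for j in range(len(momentum))]
    print(swing)  # [2, 1, 0, 1, 0, 0]
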
def replace_na_to_label(df: pd.DataFrame):
    return df.fillna("Not A Number")

def get_state_distribution(data):
    # get the matrix of correlation coefficients
    covX = np.around(np.corrcoef(data.T), decimals=3)

    # draw_heat_map(covX, "related", False)

    # Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
    eigenvalues, eigenvectors = np.linalg.eig(covX.T)

    eigenvalues = np.around(eigenvalues, decimals=3)

    eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))

    # Sort eigenvalues in descending order
    eigenvalues = sorted(eigenvalues, reverse=True)

    for i, value in enumerate(eigenvalues):
        if i == 0:
            sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
        else:
            sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)

    # draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")

    # get the contribution of the eigenvalues
    contribution = eigenvalues / np.sum(eigenvalues)

    return contribution

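# Equivalent computation (illustration, not original repo code): the
# contribution above is a PCA-style explained-variance ratio. For a
# symmetric correlation matrix, np.linalg.eigh is the more robust routine
# (guaranteed real eigenvalues, returned in ascending order).
def _demo_eigen_contribution(data):
    corr = np.corrcoef(data.T)
    eigenvalues = np.linalg.eigh(corr)[0][::-1]  # flip to descending
    return eigenvalues / eigenvalues.sum()       # sums to 1.0
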
# Exponentially weighted moving average
def exponential_moving_average(df):
    alpha = 0.3

    ema = [df[0]]

    for i in range(1, len(df)):
        ema_value = alpha * df[i] + (1 - alpha) * ema[i - 1]
        ema.append(ema_value)

    return ema

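# Equivalence check (illustration, not original repo code): with
# ema[0] = df[0] and ema[i] = alpha * df[i] + (1 - alpha) * ema[i - 1],
# the loop above matches pandas' exponentially weighted mean when
# adjust=False.
def _demo_ema_equivalence():
    values = [10.0, 12.0, 11.0, 15.0, 14.0]
    by_loop = exponential_moving_average(values)
    by_pandas = pd.Series(values).ewm(alpha=0.3, adjust=False).mean().tolist()
    print(np.allclose(by_loop, by_pandas))  # True
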
def need_to_mark_in_plot(df, col_name):
    return df.where(df[col_name] == 1).dropna()


def point_victor_mapping(df):
    mapping = {
        1: 0.0,
        2: 1.0
    }
    df["point_victor"] = df["point_victor"].map(mapping)

    return df


def pick_matches_with_name(df, name):
    df = df.where(df["match_id"] == name).dropna()

    p1_name = df["player1"].iloc[0]
    p2_name = df["player2"].iloc[0]

    return df, p1_name, p2_name


def pick_matches_with_longest(df):
    target_match_id = df.groupby("match_id").size().idxmax()

    df = df.where(df["match_id"] == target_match_id).dropna()

    p1_name = df["player1"].iloc[0]
    p2_name = df["player2"].iloc[0]

    return df, p1_name, p2_name


def choose_y_col_in_dataframe(df: pd.DataFrame, y_col: str):
    y_data = df[y_col]
    df.drop(y_col, axis=1, inplace=True)
    df.insert(0, y_col, y_data)

    return df

def load_data(sort):
    type = ""
    if sort == "Iris Dataset":
        sk_data = load_iris()
        type = "classification"
    elif sort == "Wine Dataset":
        sk_data = load_wine()
        type = "classification"
    elif sort == "Breast Cancer Dataset":
        sk_data = load_breast_cancer()
        type = "classification"
    elif sort == "Diabetes Dataset":
        sk_data = load_diabetes()
        type = "regression"
    elif sort == "California Housing Dataset":
        df = pd.read_csv("./data/fetch_california_housing.csv")
        return df
    else:
        sk_data = load_iris()
        type = "classification"

    if type == "classification":
        target_data = sk_data.target.astype(str)
        for i in range(len(sk_data.target_names)):
            target_data = np.where(target_data == str(i), sk_data.target_names[i], target_data)
    else:
        target_data = sk_data.target

    feature_names = sk_data.feature_names
    sk_feature_names = ["target"] + feature_names.tolist() if isinstance(feature_names, np.ndarray) else ["target"] + feature_names
    sk_data = np.concatenate((target_data.reshape(-1, 1), sk_data.data), axis=1)

    df = pd.DataFrame(data=sk_data, columns=sk_feature_names)

    return df


def load_custom_data(file):
    if "xlsx" in file or "xls" in file:
        return pd.read_excel(file)
    elif "csv" in file:
        return pd.read_csv(file)

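# Usage sketch (illustration, not original repo code): load_data returns a
# single DataFrame whose first column is "target", with class indices
# already replaced by class names for the classification datasets.
def _demo_load_data():
    df = load_data("Iris Dataset")
    print(df.columns[0])          # target
    print(df["target"].unique())  # ['setosa' 'versicolor' 'virginica']
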
def preprocess_raw_data_filtering(df):
    info = {}

    len_0 = len(df)
    info["Total size of raw data"] = len_0

    # Delete the column "CUSTOMER_ID"
    # df.drop("CUSTOMER_ID", axis=1, inplace=True)

    # Remove duplicate data (drop_duplicates returns a new frame, so reassign)
    df = df.drop_duplicates()
    len_1 = len_0 - len(df)
    info["Number of duplicates in the raw data"] = len_1

    # Remove "nan" data
    # df = remove_nan_from_data(df)
    # len_2 = len_0 - len_1 - len(df)
    # info["Number of nan in the raw data"] = len_2

    info["Total size of filtered data after data preprocessing"] = len(df)

    # Save the cleaned data to a csv format file
    # df.to_csv("../data/filtered_data.csv", index=False)

    return df, info


def remove_nan_from_data(df):
    # Remove "nan" data
    df.dropna(inplace=True)

    return df

# Get standardized data
def get_standardized_data(df):
    array = np.concatenate(((df.iloc[:, :1]).values, preprocessing.scale(df.iloc[:, 1:])), axis=1)

    return array

def split_dataset(array):
    x_train_and_validate, x_test, y_train_and_validate, y_test = train_test_split(
        array[:, 1:],
        array[:, :1],
        random_state=Config.RANDOM_STATE,
        train_size=0.8
    )

    return x_train_and_validate, x_test, y_train_and_validate, y_test

def k_fold_cross_validation_data_segmentation(x_train, y_train):
    k = 5

    train_data_array = np.concatenate((y_train, x_train), axis=1)

    k_fold = KFold(n_splits=k, shuffle=True, random_state=Config.RANDOM_STATE)

    train_data_list = []
    validate_data_list = []
    for train_index, validate_index in k_fold.split(train_data_array):
        train_data_list.append(train_data_array[train_index])
        validate_data_list.append(train_data_array[validate_index])

    train_and_validate_data_list = []

    for i in range(k):
        train_and_validate_data_list.append((
            train_data_list[i][:, 1:],
            validate_data_list[i][:, 1:],
            train_data_list[i][:, 0],
            validate_data_list[i][:, 0]
        ))

    return train_and_validate_data_list

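# Consumption sketch (illustration, not original repo code): each element
# of the returned list is a 4-tuple in the order appended above --
# (x_fold_train, x_fold_validate, y_fold_train, y_fold_validate).
def _demo_k_fold_usage():
    from sklearn.linear_model import LogisticRegression
    rng = np.random.default_rng(0)
    x = rng.normal(size=(50, 3))
    y = (x[:, 0] > 0).astype(float).reshape(-1, 1)
    for x_tr, x_val, y_tr, y_val in k_fold_cross_validation_data_segmentation(x, y):
        print(LogisticRegression().fit(x_tr, y_tr).score(x_val, y_val))
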
def grid_search(params, model, x_train, y_train, scoring=None):
    info = {}

    grid_search_model = GridSearchCV(model, params, cv=3, n_jobs=-1)

    grid_search_model.fit(x_train, y_train.ravel())

    info["Optimal hyperparameters"] = grid_search_model.best_params_

    best_model = grid_search_model.best_estimator_

    return best_model

def bayes_search(params, model, x_train, y_train, scoring=None):
    info = {}

    bayes_search_model = BayesSearchCV(model, params, cv=3, n_iter=50, n_jobs=-1)

    bayes_search_model.fit(x_train, y_train)

    info["Optimal hyperparameters"] = bayes_search_model.best_params_

    best_model = bayes_search_model.best_estimator_

    return best_model
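A usage sketch for the two searchers (an illustration; the estimator and parameter grid are arbitrary, and the now-deleted static.process module is assumed importable): both helpers run a cross-validated search and return the refitted best estimator, so they are interchangeable call-for-call.

# Illustration only -- not part of the original repo.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

from static.process import grid_search

x, y = load_iris(return_X_y=True)
params = {"n_estimators": [50, 100], "max_depth": [3, 5]}
best_rf = grid_search(params, RandomForestClassifier(), x, y.reshape(-1, 1))
print(best_rf.get_params()["n_estimators"])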