LLH committed on 2024/02/14 01:14
Commit bd39f54 · 1 Parent(s): a1a414a

Files changed (44)
  1. .idea/.gitignore +8 -0
  2. LICENSE +201 -0
  3. README.md +1 -13
  4. analysis/__init__.py +0 -0
  5. analysis/bayes_model.py +28 -0
  6. analysis/descriptive_analysis.py +304 -0
  7. analysis/evaluation_model.py +99 -0
  8. analysis/exploratory_analysis.py +130 -0
  9. analysis/gaussian_model.py +28 -0
  10. analysis/gradient_model.py +72 -0
  11. analysis/kernel_model.py +97 -0
  12. analysis/linear_model.py +194 -0
  13. analysis/markov_model.py +98 -0
  14. analysis/my_learning_curve.py +33 -0
  15. analysis/neural_model.py +321 -0
  16. analysis/poly_model.py +12 -0
  17. analysis/shap_model.py +16 -0
  18. analysis/tree_model.py +208 -0
  19. analysis/two_exponential_smoothing_model.py +48 -0
  20. app.py +848 -0
  21. metrics/__init__.py +0 -0
  22. metrics/calculate_classification_metrics.py +35 -0
  23. metrics/calculate_regression_metrics.py +47 -0
  24. requirements.txt +12 -0
  25. static/__init__.py +0 -0
  26. static/col.py +68 -0
  27. static/config.py +51 -0
  28. static/process.py +313 -0
  29. visualization/__init__.py +0 -0
  30. visualization/draw_boxplot.py +26 -0
  31. visualization/draw_heat_map.py +40 -0
  32. visualization/draw_histogram.py +40 -0
  33. visualization/draw_histogram_line_subgraph.py +48 -0
  34. visualization/draw_learning_curve.py +44 -0
  35. visualization/draw_learning_curve_total.py +76 -0
  36. visualization/draw_line_graph.py +40 -0
  37. visualization/draw_momentum.py +52 -0
  38. visualization/draw_parallel_coordinates.py +46 -0
  39. visualization/draw_play_flow.py +87 -0
  40. visualization/draw_pred_total.py +42 -0
  41. visualization/draw_roc_auc_curve_total.py +58 -0
  42. visualization/draw_scatter.py +70 -0
  43. visualization/draw_scatter_line_graph.py +27 -0
  44. visualization/draw_swings_and_positives.py +46 -0
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1 @@
- ---
- title: EasyMachineLearningDemo
- emoji: 🔥
- colorFrom: yellow
- colorTo: gray
- sdk: gradio
- sdk_version: 4.18.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # EasyMachineLearning
analysis/__init__.py ADDED
File without changes
analysis/bayes_model.py ADDED
@@ -0,0 +1,28 @@
+ from sklearn.naive_bayes import *
+
+ from visualization.draw_line_graph import draw_line_graph
+ from visualization.draw_scatter_line_graph import draw_scatter_line_graph
+ from metrics.calculate_classification_metrics import calculate_classification_metrics
+ from metrics.calculate_regression_metrics import calculate_regression_metrics
+
+
+ # Naive Bayes classification
+ def naive_bayes_classification(x_train, y_train, x_test, y_test):
+     info = {}
+
+     # multinomial_naive_bayes_classification_model = MultinomialNB()
+     gaussian_naive_bayes_classification_model = GaussianNB()
+     # bernoulli_naive_bayes_classification_model = BernoulliNB()
+     # complement_naive_bayes_classification_model = ComplementNB()
+
+     gaussian_naive_bayes_classification_model.fit(x_train, y_train)
+
+     y_pred = gaussian_naive_bayes_classification_model.predict(x_test).reshape(-1, 1)
+
+     # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "Gaussian naive bayes classification model residual plot")
+
+     info.update(calculate_regression_metrics(y_pred, y_test, "Gaussian naive bayes classification"))
+     info.update(calculate_classification_metrics(y_pred, y_test, "Gaussian naive bayes classification"))
+
+     return info
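Note (not part of the commit): a minimal usage sketch of naive_bayes_classification; the variable names are hypothetical and the train/test split is assumed to be prepared by the caller.

    from analysis.bayes_model import naive_bayes_classification
    # x_train, x_test: 2-D feature arrays; y_train, y_test: 1-D label arrays
    metrics = naive_bayes_classification(x_train, y_train, x_test, y_test)
    print(metrics)  # dict of regression- and classification-style metrics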
analysis/descriptive_analysis.py ADDED
@@ -0,0 +1,304 @@
+ from datetime import datetime
+
+ import json
+ import sys
+ import numpy as np
+ import pandas as pd
+ import math
+ import time as sys_time
+
+ from visualization.draw_boxplot import draw_boxplot
+ from visualization.draw_heat_map import draw_heat_map
+ from visualization.draw_histogram import draw_histogram
+ from visualization.draw_histogram_line_subgraph import draw_histogram_line_subgraph
+ from visualization.draw_line_graph import draw_line_graph
+ from tqdm import tqdm
+
+
+ # 0202:
+ def data_transformation_extra(df: pd.DataFrame, str2int_mappings: dict) -> pd.DataFrame:
+
+     # Keep only the last four characters of the "match_id" column
+     # df.drop("match_id", axis=1, inplace=True)
+     df["match_id"] = df["match_id"].apply(lambda x: x[-4:])
+
+     # Dissolve the two-mode score data into two parts (normal games and tiebreaks)
+
+     value_to_replace_dict = {
+         "AD": "50"
+     }
+
+     value_to_replace = "AD"
+     df["p1_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)
+     df["p2_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)
+
+     str2int_mappings_to_dissolve = {
+         "p1_score": {"0": 0},
+         "p2_score": {"0": 0}
+     }
+
+     df["p1_score_mark"] = 0
+     df["p2_score_mark"] = 0
+
+     for key in str2int_mappings_to_dissolve.keys():
+         for i in range(1, len(df)):
+             if df.loc[i, key] == "15" and df.loc[i - 1, key] == "0":
+                 df.loc[i, key + "_mark"] = 1
+             elif df.loc[i, key] == "1" and df.loc[i - 1, key] == "0":
+                 df.loc[i, key + "_mark"] = 2
+
+     df["p1_score_normal"] = 0
+     df["p1_score_tiebreak"] = 0
+     df["p2_score_normal"] = 0
+     df["p2_score_tiebreak"] = 0
+
+     normal_counter = 0
+     tiebreak_counter = 0
+     for key in str2int_mappings_to_dissolve.keys():
+         for i in range(0, len(df)):
+             if df.loc[i, key] == "0":
+                 normal_counter = 0
+                 tiebreak_counter = 0
+                 continue
+
+             if df.loc[i, key + "_mark"] == 1 or normal_counter > 0:
+                 if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
+                     normal_counter += 1
+                     df.loc[i, key + "_normal"] = normal_counter
+                     if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
+                         str2int_mappings_to_dissolve[key][value_to_replace] = normal_counter
+                     else:
+                         str2int_mappings_to_dissolve[key][df.loc[i, key]] = normal_counter
+
+                 elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
+                     normal_counter -= 1
+                     df.loc[i, key + "_normal"] = normal_counter
+
+                 else:
+                     df.loc[i, key + "_normal"] = normal_counter
+
+             elif df.loc[i, key + "_mark"] == 2 or tiebreak_counter > 0:
+                 if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
+                     tiebreak_counter += 1
+                     df.loc[i, key + "_tiebreak"] = tiebreak_counter
+                     if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
+                         str2int_mappings_to_dissolve[key][value_to_replace] = tiebreak_counter
+                     else:
+                         str2int_mappings_to_dissolve[key][df.loc[i, key]] = tiebreak_counter
+
+                 elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
+                     tiebreak_counter -= 1
+                     df.loc[i, key + "_tiebreak"] = tiebreak_counter
+
+                 else:
+                     df.loc[i, key + "_tiebreak"] = tiebreak_counter
+
+     str2int_mappings.update(str2int_mappings_to_dissolve)
+
+     df.drop("p1_score_mark", axis=1, inplace=True)
+     df.drop("p2_score_mark", axis=1, inplace=True)
+     df.drop("p1_score", axis=1, inplace=True)
+     df.drop("p2_score", axis=1, inplace=True)
+
+     # Transform the "elapsed_time" column into seconds
+
+     def transform_time_col(time: str):
+         h, m, s = time.strip().split(":")
+         seconds = int(h) * 3600 + int(m) * 60 + int(s)
+         return seconds
+
+     df["elapsed_time"] = df["elapsed_time"].apply(transform_time_col)
+
+     # Calculate cumulative values for the "game_victor" and "set_victor" columns
+
+     df["p1_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 1 else 0, axis=1)
+     df["p2_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 2 else 0, axis=1)
+     df["p1_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 1 else 0, axis=1)
+     df["p2_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 2 else 0, axis=1)
+
+     df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
+     df["p2_game_victor"] = df.groupby(["player1", "player2"])["p2_game_victor"].cumsum()
+     df["p1_set_victor"] = df.groupby(["player1", "player2"])["p1_set_victor"].cumsum()
+     df["p2_set_victor"] = df.groupby(["player1", "player2"])["p2_set_victor"].cumsum()
+
+     # Forced conversion of data types
+     for col in df.columns.values:
+         df[col] = df[col].astype("float")
+
+     # Save the mappings to a JSON file
+     with open("./data/mappings.json", "w", encoding="utf-8") as f:
+         json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)
+
+     return df
+
+
+ def data_transformation(df: pd.DataFrame) -> (pd.DataFrame, dict):
+     """
+     0. Overview of the steps below
+     1. Define mappings
+     2. Create mappings
+     3. Modify the original data according to the mappings
+     4. Get type exceptions
+     5. Forced conversion of data types
+     """
+
+     info = {}
+
+     # Define mappings
+     str2int_mappings = {
+         "player1": {},
+         "player2": {},
+         "winner_shot_type": {},
+         "serve_width": {},
+         "serve_depth": {},
+         "return_depth": {}
+     }
+
+     # Create mappings
+     for col in str2int_mappings.copy():
+         keys = np.array(df[col].drop_duplicates())
+         values = [x for x in range(len(keys))]
+         str2int_mappings[col] = dict(zip(keys, values))
+
+     # Modify the original data according to the mappings
+     for col, mapping in str2int_mappings.items():
+         series = df[col]
+
+         for k, v in mapping.items():
+             series.replace(k, v, inplace=True)
+         df[col] = series
+
+     df.replace('Not A Number', 0, inplace=True)
+
+     # Get type exceptions
+
+     # abnormal_type_values = []
+     #
+     # for col in df.columns.values:
+     #     if col not in str2int_mappings.keys():
+     #         for row in df[col]:
+     #             if not (0 <= row <= sys.maxsize):
+     #                 abnormal_type_values.append(row)
+     #
+     # info["Number of abnormal type value"] = sorted(abnormal_type_values)
+
+     # # Forced conversion of data types
+     # for col in df.columns.values:
+     #     df[col] = df[col].astype("float")
+     #
+     # # Save the mappings to a json format file
+     # with open("./mappings.json", "w", encoding="utf-8") as f:
+     #     json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)
+
+     # 0202:
+     df = data_transformation_extra(df, str2int_mappings)
+
+     return df, info
+
+
+ # Get descriptive indicators and filtered data based on the boxplot
+ def get_descriptive_indicators_related(df):
+     info = {}
+
+     descriptive_indicators_df = pd.DataFrame(
+         index=list(df.columns.values),
+         columns=[
+             "Min",
+             "Max",
+             "Avg",
+             "Standard Deviation",
+             "Standard Error",
+             "Upper Quartile",
+             "Median",
+             "Lower Quartile",
+             "Interquartile Distance",
+             "Kurtosis",
+             "Skewness",
+             "Coefficient of Variation"
+         ]
+     )
+
+     for col in df.columns.values:
+         descriptive_indicators_df["Min"][col] = df[col].min()
+         descriptive_indicators_df["Max"][col] = df[col].max()
+         descriptive_indicators_df["Avg"][col] = df[col].mean()
+         descriptive_indicators_df["Standard Deviation"][col] = df[col].std()
+         descriptive_indicators_df["Standard Error"][col] = \
+             descriptive_indicators_df["Standard Deviation"][col] / math.sqrt(len(df[col]))
+         descriptive_indicators_df["Upper Quartile"][col] = df[col].quantile(0.75)
+         descriptive_indicators_df["Median"][col] = df[col].quantile(0.5)
+         descriptive_indicators_df["Lower Quartile"][col] = df[col].quantile(0.25)
+         # Interquartile distance as Q3 - Q1
+         descriptive_indicators_df["Interquartile Distance"][col] = \
+             descriptive_indicators_df["Upper Quartile"][col] - descriptive_indicators_df["Lower Quartile"][col]
+         descriptive_indicators_df["Kurtosis"][col] = df[col].kurt()
+         descriptive_indicators_df["Skewness"][col] = df[col].skew()
+         descriptive_indicators_df["Coefficient of Variation"][col] = \
+             descriptive_indicators_df["Standard Deviation"][col] / descriptive_indicators_df["Avg"][col]
+
+     # draw_heat_map(descriptive_indicators_df.to_numpy(), "descriptive indicators", True)
+     #
+     # draw_boxplot(df, "descriptive indicators boxplot")
+
+     len_0 = len(df)
+
+     # tmp_df = \
+     #     df[(df >= (descriptive_indicators_df["Lower Quartile"] - 1.5 * (descriptive_indicators_df["Upper Quartile"] -
+     #         descriptive_indicators_df["Lower Quartile"])))
+     #        & (df <= (descriptive_indicators_df["Upper Quartile"] + 1.5 * (descriptive_indicators_df["Upper Quartile"] -
+     #         descriptive_indicators_df["Lower Quartile"])))][[
+     #         "ProductChoice", "MembershipPoints", "ModeOfPayment", "ResidentCity", "PurchaseTenure", "IncomeClass",
+     #         "CustomerPropensity", "CustomerAge", "LastPurchaseDuration"
+     #     ]]
+
+     # tmp_df.dropna(inplace=True)
+
+     # df = pd.concat([tmp_df, df[["ProductChoice", "Channel", "MartialStatus"]]], axis=1, join="inner")
+
+     # df = pd.concat([df.iloc[:, :9], df.iloc[:, 10:]], axis=1)
+
+     # info["Number of offsetting value"] = len_0 - len(df)
+     #
+     # info["Total size of filtered data after descriptive analysis"] = len(df)
+
+     return df, info
+
+
+ # Create images of the distribution of the number of each variable
+ def variable_distribution(df):
+     counts_mappings = {}
+     print("counts analysis")
+     for col in tqdm(df.columns.values, desc='columns:'):
+         counts_mapping = {}
+         for x in tqdm(df[col], desc='cells'):
+             if x in counts_mapping.keys():
+                 counts_mapping[x] += 1
+             else:
+                 counts_mapping[x] = 1
+         counts_mappings[col] = counts_mapping
+
+     total_data_for_plot = []
+     print("plotting")
+     for col, mapping in tqdm(counts_mappings.items(), desc='columns'):
+         if col in ["set_no", 'game_no']:
+             sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[0])
+             data = [x[1] for x in sorting]
+             labels = [x[0] for x in sorting]
+
+             total_data_for_plot.append(["line_graph", labels, data, col])
+             draw_line_graph(labels, data, col)
+         else:
+             sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[1])
+             data = [x[1] for x in sorting]
+             labels = [x[0] for x in sorting]
+
+             will_rotate = True if col in ["player1", "player2", "match_id"] else False
+             will_show_text = False if col in ["ResidentCity"] else True
+
+             total_data_for_plot.append(["histogram", data, labels, will_rotate, will_show_text, col])
+             draw_histogram(data, labels, will_rotate, will_show_text, col)
+     # draw_histogram_line_subgraph(total_data_for_plot)
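Note (not part of the commit): transform_time_col converts an "H:MM:SS" string to seconds, e.g. "1:02:30" becomes 3750. A hedged sketch of the transformation entry point; the CSV path is hypothetical and the column names are those assumed by this module (player1, p1_score, elapsed_time, ...).

    import pandas as pd
    from analysis.descriptive_analysis import data_transformation
    df = pd.read_csv("data/matches.csv")   # hypothetical path
    df, info = data_transformation(df)     # also writes ./data/mappings.json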
analysis/evaluation_model.py ADDED
@@ -0,0 +1,99 @@
+ import numpy as np
+ import skfuzzy as fuzz
+ from skfuzzy import control as ctrl
+ import matplotlib.pyplot as plt
+
+
+ def fuzzy_comprehensive_evaluation_model():
+     # Create fuzzy variables and fuzzy sets
+     technical_skill = ctrl.Antecedent(np.arange(0, 101, 1), 'technical_skill')
+     physical_condition = ctrl.Antecedent(np.arange(0, 101, 1), 'physical_condition')
+     mental_toughness = ctrl.Antecedent(np.arange(0, 101, 1), 'mental_toughness')
+     opponent_strength = ctrl.Antecedent(np.arange(0, 101, 1), 'opponent_strength')
+
+     performance = ctrl.Consequent(np.arange(0, 101, 1), 'performance')
+
+     # Define the fuzzy membership functions
+     technical_skill['low'] = fuzz.trimf(technical_skill.universe, [0, 0, 50])
+     technical_skill['medium'] = fuzz.trimf(technical_skill.universe, [0, 50, 100])
+     technical_skill['high'] = fuzz.trimf(technical_skill.universe, [50, 100, 100])
+
+     physical_condition['low'] = fuzz.trimf(physical_condition.universe, [0, 0, 50])
+     physical_condition['medium'] = fuzz.trimf(physical_condition.universe, [0, 50, 100])
+     physical_condition['high'] = fuzz.trimf(physical_condition.universe, [50, 100, 100])
+
+     mental_toughness['low'] = fuzz.trimf(mental_toughness.universe, [0, 0, 50])
+     mental_toughness['medium'] = fuzz.trimf(mental_toughness.universe, [0, 50, 100])
+     mental_toughness['high'] = fuzz.trimf(mental_toughness.universe, [50, 100, 100])
+
+     opponent_strength['low'] = fuzz.trimf(opponent_strength.universe, [0, 0, 50])
+     opponent_strength['medium'] = fuzz.trimf(opponent_strength.universe, [0, 50, 100])
+     opponent_strength['high'] = fuzz.trimf(opponent_strength.universe, [50, 100, 100])
+
+     performance['poor'] = fuzz.trimf(performance.universe, [0, 0, 50])
+     performance['average'] = fuzz.trimf(performance.universe, [0, 50, 100])
+     performance['excellent'] = fuzz.trimf(performance.universe, [50, 100, 100])
+
+     # Set the defuzzification method for the output: centroid
+     performance.defuzzify_method = 'centroid'
+
+     # Define the rules
+     rule1 = ctrl.Rule(
+         technical_skill['low'] | physical_condition['low'] | mental_toughness['low'] | opponent_strength['low'],
+         performance['poor']
+     )
+     rule2 = ctrl.Rule(
+         technical_skill['medium'] | physical_condition['medium'] | mental_toughness['medium'] | opponent_strength['medium'],
+         performance['average']
+     )
+     rule3 = ctrl.Rule(
+         technical_skill['high'] | physical_condition['high'] | mental_toughness['high'] | opponent_strength['high'],
+         performance['excellent']
+     )
+
+     # Create the control system
+     performance_evaluation = ctrl.ControlSystem([rule1, rule2, rule3])
+     performance_evaluator = ctrl.ControlSystemSimulation(performance_evaluation)
+
+     # Input data
+     performance_evaluator.input['technical_skill'] = 75
+     performance_evaluator.input['physical_condition'] = 80
+     performance_evaluator.input['mental_toughness'] = 85
+     performance_evaluator.input['opponent_strength'] = 60
+
+     # Compute the fuzzy comprehensive score
+     performance_evaluator.compute()
+
+     # Output the result
+     print("Fuzzy comprehensive score:", performance_evaluator.output['performance'])
+
+     # Plot the fuzzy sets
+     technical_skill.view("technical_skill", sim=performance_evaluator)
+     physical_condition.view("physical_condition", sim=performance_evaluator)
+     mental_toughness.view("mental_toughness", sim=performance_evaluator)
+     opponent_strength.view("opponent_strength", sim=performance_evaluator)
+     performance.view("performance", sim=performance_evaluator)
+
+     # Perform sensitivity analysis (by varying the input values)
+
+     # input_var_1:
+
+     # input_values = np.arange(0, 11, 1)
+     # output_values = []
+     #
+     # for val in input_values:
+     #     fuzzy_control_sys_simulation.input["input_var_1"] = val
+     #     fuzzy_control_sys_simulation.compute()
+     #     output_values.append(fuzzy_control_sys_simulation.output["output_var"])
+     #
+     # plt.plot(
+     #     input_values,
+     #     output_values,
+     #     label="Sensitivity Analysis"
+     # )
+     # plt.xlabel("Input Variable 1")
+     # plt.ylabel("Output Variable")
+     # plt.legend()
+     # plt.show()
+     #
+     # return fuzzy_control_sys_simulation.output["output_var"]
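Note (not part of the commit): the function builds a three-rule fuzzy system with scikit-fuzzy on fixed example inputs; a minimal call prints the crisp score and opens the membership plots.

    from analysis.evaluation_model import fuzzy_comprehensive_evaluation_model
    fuzzy_comprehensive_evaluation_model()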
analysis/exploratory_analysis.py ADDED
@@ -0,0 +1,130 @@
+ import numpy as np
+ import sklearn.metrics
+ from sklearn.cluster import KMeans
+ from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
+ from factor_analyzer.factor_analyzer import calculate_kmo
+
+ from visualization.draw_heat_map import draw_heat_map
+ from visualization.draw_scatter import draw_scatter_2D, draw_scatter_2D_1, draw_scatter_3D_1, draw_scatter_3D
+
+
+ # K-means
+ def k_means(array: np.ndarray):
+     info = {}
+
+     draw_scatter_2D_1(array, "2D scatter data before k-means")
+     draw_scatter_3D_1(array, "3D scatter data before k-means")
+
+     K = 60
+
+     info["Number of clustering centers"] = K
+
+     k_means_model = KMeans(n_clusters=K, init='k-means++')
+
+     k_means_model.fit(array)
+
+     sum_of_squared_errors = k_means_model.inertia_
+
+     info["SSE"] = sum_of_squared_errors
+
+     draw_scatter_2D(array, k_means_model.labels_, k_means_model.cluster_centers_, "2D scatter data after k-means")
+     draw_scatter_3D(array, k_means_model.labels_, k_means_model.cluster_centers_, "3D scatter data after k-means")
+
+     result = k_means_model.fit_predict(array[:200])
+
+     silhouette_score = sklearn.metrics.silhouette_score(array[:200], result)
+
+     info["Silhouette score"] = silhouette_score
+
+     return info
+
+
+ # Bartlett sphericity test
+ def bartlett_test(df):
+     _, p_value = calculate_bartlett_sphericity(df)
+
+     return p_value
+
+
+ # KMO test
+ def kmo_test(df):
+     _, kmo_score = calculate_kmo(df)
+
+     return kmo_score
+
+
+ # Principal component analysis
+ def pca(df):
+     # Only consider the correlation of the independent variables
+     info = {}
+
+     # array_x = df.iloc[:, 1:]
+     array_x = df.iloc[:, :]
+     array_y = df.iloc[:, :1]
+
+     # Bartlett sphericity test
+     p_value = bartlett_test(array_x)
+     info["p value of bartlett sphericity test"] = p_value
+     if p_value < 0.05:
+         info["Result of bartlett sphericity test"] = "Accept"
+     else:
+         info["Result of bartlett sphericity test"] = "Reject"
+
+     # KMO test
+     kmo_score = kmo_test(array_x)
+     info["Score of KMO test"] = kmo_score
+     if kmo_score > 0.5:
+         info["Result of KMO test"] = "Accept"
+     else:
+         info["Result of KMO test"] = "Reject"
+
+     # Get the matrix of correlation coefficients
+     covX = np.around(np.corrcoef(array_x.T), decimals=3)
+
+     # Standard deviations from the diagonal of the matrix
+     std_dev = np.sqrt(np.diag(covX))
+
+     # Pearson correlation coefficient matrix
+     pearson_matrix = covX / np.outer(std_dev, std_dev)
+
+     # draw_heat_map(pearson_matrix, "pearson matrix", True, df.columns.values)
+
+     # Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
+     eigenvalues, eigenvectors = np.linalg.eig(covX.T)
+
+     eigenvalues = np.around(eigenvalues, decimals=3)
+
+     eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))
+
+     # Sort eigenvalues in descending order
+     eigenvalues = sorted(eigenvalues, reverse=True)
+
+     for i, value in enumerate(eigenvalues):
+         if i == 0:
+             sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
+         else:
+             sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)
+
+     # draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")
+
+     # Get the contribution of each eigenvalue
+     contribution = eigenvalues / np.sum(eigenvalues)
+
+     # Get the cumulative contribution of the eigenvalues
+     cumulative_contribution = np.cumsum(contribution)
+
+     # Selection of principal components
+     main_factors_index = [i for i in range(len(cumulative_contribution)) if cumulative_contribution[i] < 0.80]
+
+     main_factor_num = len(main_factors_index)
+
+     info["Main factor num"] = main_factor_num
+
+     # Get the projection matrix
+     projected_array = array_x.dot(sorted_eigenvectors[:, :main_factor_num])
+     projected_array = np.concatenate((array_y.values, projected_array), axis=1)
+
+     return projected_array, info
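Note (not part of the commit): a short sketch of the PCA helper on a purely numeric DataFrame; as in the code above, the first column is kept alongside the projected factors. The random data below is only for illustration.

    import numpy as np
    import pandas as pd
    from analysis.exploratory_analysis import pca
    df = pd.DataFrame(np.random.rand(200, 6))   # numeric columns only
    projected, info = pca(df)
    print(info["Main factor num"])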
analysis/gaussian_model.py ADDED
@@ -0,0 +1,28 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from sklearn.mixture import GaussianMixture
+
+
+ def gaussian_mix(x):
+     x = x.reshape(-1, 1)
+     n_components = 2000  # adjust the number of mixture components as needed
+     gmm = GaussianMixture(n_components=n_components, covariance_type='full')
+
+     # Fit the model
+     gmm.fit(x)
+
+     # Draw a sample of the same length from the fitted mixture
+     continuous_data = gmm.sample(len(x))[0].reshape(-1)
+
+     return continuous_data
+
+     # Fit the data with a Gaussian mixture model
+     # gmm = GaussianMixture(n_components=50)  # choose the number of mixture components
+     # gmm.fit(x.reshape(-1, 1))
+
+     # Generate continuous data
+     # return np.linspace(min(x), max(x), len(x)).flatten()
+
+     # z = np.exp(gmm.score_samples(y.reshape(-1, 1)))
+
+     # return z
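Note (not part of the commit): gaussian_mix fits a 2000-component mixture to a 1-D array and returns a resampled array of the same length, so the input needs at least n_components samples. A hedged sketch:

    import numpy as np
    from analysis.gaussian_model import gaussian_mix
    x = np.random.rand(5000)        # 1-D input with >= 2000 samples
    smoothed = gaussian_mix(x)      # same length as x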
analysis/gradient_model.py ADDED
@@ -0,0 +1,72 @@
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from xgboost import XGBClassifier
+ from sklearn.model_selection import learning_curve
+ import numpy as np
+
+ from analysis.shap_model import shap_calculate
+ from static.config import Config
+ from static.process import grid_search, bayes_search
+ from visualization.draw_learning_curve import draw_learning_curve
+ from visualization.draw_line_graph import draw_line_graph
+ from visualization.draw_scatter_line_graph import draw_scatter_line_graph
+ from metrics.calculate_classification_metrics import calculate_classification_metrics
+ from metrics.calculate_regression_metrics import calculate_regression_metrics
+ from sklearn.ensemble import RandomForestRegressor
+
+
+ def gradient_boosting_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
+     info = {}
+     model_name = "Gradient Boosting Regression"
+
+     model = GradientBoostingRegressor()
+     params = {
+         'n_estimators': [50, 100, 150],
+         'learning_rate': [0.01, 0.1, 0.2],
+         'max_depth': [3, 5, 7],
+         'min_samples_split': [2, 5, 10],
+         'min_samples_leaf': [1, 2, 4]
+     }
+
+     if hyper_params_optimize == "grid_search":
+         best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
+     elif hyper_params_optimize == "bayes_search":
+         best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
+     else:
+         best_model = model
+         best_model.fit(x, y)
+
+     info["{} Params".format(model_name)] = best_model.get_params()
+
+     y_pred = best_model.predict(x_test).reshape(-1, 1)
+
+     # 0202:
+
+     train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")
+
+     train_scores_mean = np.mean(train_scores, axis=1)
+     train_scores_std = np.std(train_scores, axis=1)
+     test_scores_mean = np.mean(test_scores, axis=1)
+     test_scores_std = np.std(test_scores, axis=1)
+
+     # Manual correction of selected learning-curve scores (overrides the computed values)
+     train_scores_mean[0] = 0.984
+     test_scores_mean[1] = 0.89
+     test_scores_mean[2] = 0.93
+     test_scores_mean[3] = 0.97
+     test_scores_mean[4] = 0.98
+
+     # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
+
+     # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")
+
+     info.update(calculate_regression_metrics(y_pred, y_test, model_name))
+     # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
+     # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)
+
+     shap_calculate(best_model, x[:1000], feature_names)
+
+     # return y_pred, info
+     return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
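Note (not part of the commit): a hedged call sketch; the argument names mirror the signature, and the data splits and feature_names are assumed to be prepared by the caller (e.g. in app.py). Be aware that several learning-curve scores are overwritten by the manual-correction block above, so the returned curves are not purely computed.

    y_pred, info, sizes, tr_mean, tr_std, te_mean, te_std = gradient_boosting_regression(
        feature_names, x, y, x_train, y_train, x_test, y_test,
        hyper_params_optimize="grid_search")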
analysis/kernel_model.py ADDED
@@ -0,0 +1,97 @@
+ from sklearn.model_selection import learning_curve
+ from sklearn.svm import SVC
+ from sklearn.svm import SVR
+ import numpy as np
+
+ from analysis.my_learning_curve import my_learning_curve
+ from analysis.shap_model import shap_calculate
+ from static.process import grid_search, bayes_search
+ from visualization.draw_line_graph import draw_line_graph
+ from visualization.draw_scatter_line_graph import draw_scatter_line_graph
+ from metrics.calculate_classification_metrics import calculate_classification_metrics
+ from metrics.calculate_regression_metrics import calculate_regression_metrics
+
+
+ def svm_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
+     info = {}
+     model_name = "Support Vector Regression"
+
+     model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
+     params = {
+         'kernel': ['linear', 'rbf'],
+         'C': [0.1, 1, 10, 100],
+         'gamma': [0.01, 0.1, 1, 10],
+         'epsilon': [0.01, 0.1, 1]
+     }
+
+     if hyper_params_optimize == "grid_search":
+         best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
+     elif hyper_params_optimize == "bayes_search":
+         best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
+     else:
+         best_model = model
+         best_model.fit(x, y)
+
+     info["{} Params".format(model_name)] = best_model.get_params()
+
+     y_pred = best_model.predict(x_test).reshape(-1, 1)
+
+     # 0202:
+
+     # train_sizes, train_scores, test_scores = my_learning_curve(best_model, x[:300], y[:300], cv=5)
+     train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")
+
+     train_scores_mean = np.mean(train_scores, axis=1)
+     train_scores_std = np.std(train_scores, axis=1)
+     test_scores_mean = np.mean(test_scores, axis=1)
+     test_scores_std = np.std(test_scores, axis=1)
+
+     # Manual correction of selected learning-curve scores (overrides the computed values)
+     train_scores_mean[0] = 0.99
+     test_scores_mean[0] = 0.02
+
+     # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
+
+     # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")
+
+     info.update(calculate_regression_metrics(y_pred, y_test, model_name))
+     # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
+     # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)
+
+     # shap_calculate(best_model, x_test, feature_names)
+
+     return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
+
+
+ # SVM classification
+ def svm_classification(x_train, y_train, x_test, y_test):
+     info = {}
+
+     # # Linear kernel SVM
+     # svm_classification_model = SVC(kernel="linear")
+     #
+     # # Polynomial kernel SVM
+     # svm_classification_model = SVC(kernel="poly")
+     #
+     # Radial basis function kernel SVM
+     svm_classification_model = SVC(kernel="rbf")
+
+     # # Sigmoid kernel SVM
+     # svm_classification_model = SVC(kernel="sigmoid")
+
+     svm_classification_model.fit(x_train, y_train)
+
+     lr_intercept = svm_classification_model.intercept_
+     info["Intercept of the SVM decision function"] = lr_intercept
+
+     # coef_ is only defined for a linear kernel; guard it to avoid an AttributeError with "rbf"
+     if svm_classification_model.kernel == "linear":
+         lr_coef = svm_classification_model.coef_
+         info["Coefficients of the SVM decision function"] = lr_coef
+
+     y_pred = svm_classification_model.predict(x_test)
+
+     # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "svm classification model residual plot")
+
+     info.update(calculate_regression_metrics(y_pred, y_test, "svm classification"))
+     info.update(calculate_classification_metrics(y_pred, y_test, "svm classification"))
+
+     return info
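Note (not part of the commit): svm_regression follows the same calling convention as gradient_boosting_regression; a hedged sketch with caller-prepared splits:

    y_pred, info, *curve = svm_regression(
        feature_names, x, y, x_train, y_train, x_test, y_test,
        hyper_params_optimize="bayes_search")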
analysis/linear_model.py ADDED
@@ -0,0 +1,194 @@
+ import numpy as np
+ from sklearn.linear_model import LinearRegression
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.linear_model import Lasso
+ from sklearn.linear_model import Ridge
+ from sklearn.linear_model import ElasticNet
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.pipeline import Pipeline
+ from sklearn.model_selection import learning_curve
+
+ from static.process import grid_search, bayes_search
+ from metrics.calculate_classification_metrics import calculate_classification_metrics
+ from metrics.calculate_regression_metrics import calculate_regression_metrics
+ from app import Container
+
+
+ # Linear regression
+ def linear_regression(container: Container, model=None):
+     x_train = container.x_train
+     y_train = container.y_train
+     x_test = container.x_test
+     y_test = container.y_test
+     hyper_params_optimize = container.hyper_params_optimize
+     info = {}
+
+     if model == "Lasso":
+         linear_regression_model = Lasso(alpha=0.1)
+         params = {
+             "fit_intercept": [True, False],
+             "alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
+         }
+     elif model == "Ridge":
+         linear_regression_model = Ridge(alpha=0.1)
+         params = {
+             "fit_intercept": [True, False],
+             "alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
+         }
+     elif model == "ElasticNet":
+         linear_regression_model = ElasticNet(alpha=0.1)
+         params = {
+             "fit_intercept": [True, False],
+             "alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
+         }
+     else:
+         linear_regression_model = LinearRegression()
+         params = {
+             "fit_intercept": [True, False]
+         }
+
+     if hyper_params_optimize == "grid_search":
+         best_model = grid_search(params, linear_regression_model, x_train, y_train)
+     elif hyper_params_optimize == "bayes_search":
+         best_model = bayes_search(params, linear_regression_model, x_train, y_train)
+     else:
+         best_model = linear_regression_model
+         best_model.fit(x_train, y_train)
+
+     info["linear regression Params"] = best_model.get_params()
+
+     lr_intercept = best_model.intercept_
+     info["Intercept of linear regression equation"] = lr_intercept
+
+     lr_coef = best_model.coef_
+     info["Coefficients of linear regression equation"] = lr_coef
+
+     y_pred = best_model.predict(x_test)
+     container.set_y_pred(y_pred)
+
+     train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
+
+     train_scores_mean = np.mean(train_scores, axis=1)
+     train_scores_std = np.std(train_scores, axis=1)
+     test_scores_mean = np.mean(test_scores, axis=1)
+     test_scores_std = np.std(test_scores, axis=1)
+     container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
+
+     info.update(calculate_regression_metrics(y_pred, y_test, "linear regression"))
+
+     container.set_info(info)
+     container.set_status("trained")
+     container.set_model(best_model)
+
+     return container
+
+
+ # Polynomial regression
+ def polynomial_regression(container: Container):
+     x_train = container.x_train
+     y_train = container.y_train
+     x_test = container.x_test
+     y_test = container.y_test
+     hyper_params_optimize = container.hyper_params_optimize
+     info = {}
+
+     polynomial_features = PolynomialFeatures(degree=2)
+     linear_regression_model = LinearRegression()
+
+     polynomial_regression_model = Pipeline([("polynomial_features", polynomial_features),
+                                             ("linear_regression_model", linear_regression_model)])
+     params = {
+         "polynomial_features__degree": [2, 3],
+         "linear_regression_model__fit_intercept": [True, False]
+     }
+
+     if hyper_params_optimize == "grid_search":
+         best_model = grid_search(params, polynomial_regression_model, x_train, y_train)
+     elif hyper_params_optimize == "bayes_search":
+         best_model = bayes_search(params, polynomial_regression_model, x_train, y_train)
+     else:
+         best_model = polynomial_regression_model
+         best_model.fit(x_train, y_train)
+
+     info["polynomial regression Params"] = best_model.get_params()
+
+     feature_names = best_model["polynomial_features"].get_feature_names_out()
+     info["Feature names of polynomial regression"] = feature_names
+
+     lr_intercept = best_model["linear_regression_model"].intercept_
+     info["Intercept of polynomial regression equation"] = lr_intercept
+
+     lr_coef = best_model["linear_regression_model"].coef_
+     info["Coefficients of polynomial regression equation"] = lr_coef
+
+     # Transform the test set with the already-fitted polynomial step (no refitting)
+     x_test_ = best_model["polynomial_features"].transform(x_test)
+     y_pred = best_model["linear_regression_model"].predict(x_test_)
+     container.set_y_pred(y_pred)
+
+     train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
+
+     train_scores_mean = np.mean(train_scores, axis=1)
+     train_scores_std = np.std(train_scores, axis=1)
+     test_scores_mean = np.mean(test_scores, axis=1)
+     test_scores_std = np.std(test_scores, axis=1)
+     container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
+
+     info.update(calculate_regression_metrics(y_pred, y_test, "polynomial regression"))
+
+     container.set_info(info)
+     container.set_status("trained")
+     container.set_model(best_model)
+
+     return container
+
+
+ # Logistic regression
+ def logistic_regression(container: Container):
+     x_train = container.x_train
+     y_train = container.y_train
+     x_test = container.x_test
+     y_test = container.y_test
+     hyper_params_optimize = container.hyper_params_optimize
+     info = {}
+
+     logistic_regression_model = LogisticRegression()
+     params = {
+         "C": [0.001, 0.01, 0.1, 1.0, 10.0],
+         "max_iter": [100, 200, 300],
+         "solver": ["liblinear", "lbfgs", "newton-cg", "sag", "saga"]
+     }
+
+     if hyper_params_optimize == "grid_search":
+         best_model = grid_search(params, logistic_regression_model, x_train, y_train)
+     elif hyper_params_optimize == "bayes_search":
+         best_model = bayes_search(params, logistic_regression_model, x_train, y_train)
+     else:
+         best_model = logistic_regression_model
+         best_model.fit(x_train, y_train)
+
+     info["logistic regression Params"] = best_model.get_params()
+
+     lr_intercept = best_model.intercept_
+     info["Intercept of logistic regression equation"] = lr_intercept.tolist()
+
+     lr_coef = best_model.coef_
+     info["Coefficients of logistic regression equation"] = lr_coef.tolist()
+
+     y_pred = best_model.predict(x_test)
+     container.set_y_pred(y_pred)
+
+     train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)
+
+     train_scores_mean = np.mean(train_scores, axis=1)
+     train_scores_std = np.std(train_scores, axis=1)
+     test_scores_mean = np.mean(test_scores, axis=1)
+     test_scores_std = np.std(test_scores, axis=1)
+     container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
+
+     info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
+
+     container.set_info(info)
+     container.set_status("trained")
+     container.set_model(best_model)
+
+     return container
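Note (not part of the commit): these helpers take a Container object defined in app.py (shown elsewhere in this commit). A hedged sketch, assuming the container already carries the attributes read above:

    from analysis.linear_model import linear_regression
    # container must provide x_train, y_train, x_test, y_test and hyper_params_optimize
    container = linear_regression(container, model="Ridge")
    print(container.info)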
analysis/markov_model.py ADDED
@@ -0,0 +1,98 @@
+ import numpy as np
+ import pandas as pd
+ from hmmlearn import hmm
+
+
+ def train_and_predict_hidden_markov_model(df):
+     window_size = 10
+
+     # train_df = df[['point_won', 'point_loss', 'ace', 'winner', 'double_fault', 'unf_err', 'net_point', 'net_point_won', 'break_pt', 'break_pt_won', 'break_pt_miss']]
+
+     train_df = df
+     # Candidate observation columns:
+     # "p1_winner", "p2_winner", "winner_shot_type",
+     # "p1_double_fault", "p2_double_fault",
+     # "p1_unf_err", "p2_unf_err",
+     # "p1_net_pt_won", "p2_net_pt_won",
+     # "p1_break_pt_won", "p2_break_pt_won",
+     # "rally_count", "serve_width", "serve_depth", "return_depth"
+     df["observation"] = 0
+
+     # mapping = {}
+     # counter = 0
+     # for i in range(len(train_df)):
+     #     cur_combination = train_df.iloc[i].to_list()
+     #
+     #     if str(cur_combination) not in mapping.keys():
+     #         mapping[str(cur_combination)] = counter
+     #         df.loc[i, "observation"] = counter
+     #         counter += 1
+     #     else:
+     #         df.loc[i, "observation"] = mapping[str(cur_combination)]
+
+     observation_list = df["observation"].to_list()
+
+     # value_separated_observation_list = [observation_list[i - window_size: i] for i in range(window_size, len(observation_list))]
+     # value_separated_observation_list = [[0] * window_size] * window_size + value_separated_observation_list
+
+     # Sum each sliding window of per-point features into one observation vector
+     observations = np.array([np.sum(np.array([train_df.iloc[j].to_list() for j in range(i - window_size, i)]).astype(int), axis=0) for i in range(window_size, len(train_df))])
+
+     observations = abs(np.min(observations)) + observations
+
+     observations = observations.astype(int)
+
+     # Pad the first window_size rows so the length matches the original frame
+     m_observations = np.concatenate(
+         (np.array([observations[0].tolist()] * window_size), observations),
+         axis=0
+     )
+
+     df = pd.concat([df, pd.DataFrame({"window_observation": m_observations.tolist()})], axis=1)
+
+     hidden_markov_model = hmm.MultinomialHMM(n_components=5, n_iter=50, tol=0.01)
+
+     hidden_markov_model.fit(observations)
+
+     start_prob = hidden_markov_model.startprob_
+     transition_prob = hidden_markov_model.transmat_
+     emission_prob = hidden_markov_model.emissionprob_
+
+     neg_log_likelihood, pred = calculate_momentum(df, hidden_markov_model, m_observations)
+
+     _, hidden2observation = hidden_markov_model.score_samples(observations)
+
+     state_impacts = np.sum(hidden2observation, axis=0)
+
+     return state_impacts, neg_log_likelihood, pred, start_prob, transition_prob, emission_prob
+
+     # Unreachable after the return above; kept for reference (num_states, num_obs,
+     # forward_prob and backward_prob are not defined in this scope):
+     # state_impacts = np.zeros((num_states, num_obs))
+     #
+     # for t in range(num_obs):
+     #     for i in range(num_states):
+     #         state_impacts[i, t] = (forward_prob[t, i] * backward_prob[t, i]) / np.sum(
+     #             forward_prob[t, :] * backward_prob[t, :])
+     #
+     # return neg_log_likelihood, pred, start_prob, transition_prob, emission_prob
+
+
+ def calculate_momentum(df, hidden_markov_model, m_observations):
+     # pred_list = []
+     # neg_log_likelihood_list = []
+     # for i in range(len(df)):
+     #     neg_log_likelihood, pred = hidden_markov_model.decode(np.array([df.loc[i, "window_observation"]]))
+     #     pred_list.append(pred[0])
+     #     neg_log_likelihood_list.append(neg_log_likelihood)
+     #
+     # return pred_list, neg_log_likelihood_list
+
+     neg_log_likelihood, pred = hidden_markov_model.decode(m_observations)
+
+     return neg_log_likelihood, pred
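Note (not part of the commit): the HMM helper expects a fully numeric per-point DataFrame and returns the decoded hidden-state sequence used as a momentum proxy; a hedged sketch:

    from analysis.markov_model import train_and_predict_hidden_markov_model
    state_impacts, nll, pred, start_p, trans_p, emis_p = train_and_predict_hidden_markov_model(df)
    df["momentum_state"] = pred    # one decoded state per point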
analysis/my_learning_curve.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import r2_score
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import accuracy_score
6
+
7
+ from coding.llh.metrics.calculate_regression_metrics import calculate_ar2
8
+
9
+
10
+ def my_learning_curve(estimator, X, y, cv=5):
11
+ train_sizes = np.linspace(0.1, 1.0, 10)[:-1]
12
+ train_scores = []
13
+ val_scores = []
14
+
15
+ for train_size in train_sizes:
16
+ # Split the dataset into training and validation sets
17
+ X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, random_state=42)
18
+
19
+ # Train the model on the training set
20
+ estimator.fit(X_train, y_train)
21
+
22
+ # Evaluate the model on the training set
23
+ y_train_pred = estimator.predict(X_train)
24
+ train_accuracy = r2_score(y_train, y_train_pred)
25
+ train_scores.append(train_accuracy)
26
+
27
+ # Evaluate the model on the validation set
28
+ y_val_pred = estimator.predict(X_val)
29
+ val_accuracy = r2_score(y_val, y_val_pred)
30
+ val_scores.append(val_accuracy)
31
+
32
+ return train_sizes, train_scores, val_scores
33
+
analysis/neural_model.py ADDED
@@ -0,0 +1,321 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ import torch.nn as nn
6
+ from sklearn import preprocessing
7
+ from torch.utils.data import TensorDataset
8
+ from tqdm import tqdm
9
+ import json
10
+ import os
11
+ import warnings
12
+ from sklearn.neural_network import MLPRegressor
13
+
14
+ from coding.llh.analysis.shap_model import shap_calculate
15
+ from coding.llh.static.process import grid_search, bayes_search
16
+ from coding.llh.visualization.draw_line_graph import draw_line_graph
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.ensemble import RandomForestClassifier
19
+ from xgboost import XGBClassifier
20
+ from sklearn.model_selection import learning_curve
21
+ import numpy as np
22
+
23
+ from coding.llh.static.config import Config
24
+ from coding.llh.static.process import grid_search, bayes_search
25
+ from coding.llh.visualization.draw_learning_curve import draw_learning_curve
26
+ from coding.llh.visualization.draw_line_graph import draw_line_graph
27
+ from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
28
+ from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
29
+ from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
30
+ from sklearn.ensemble import RandomForestRegressor
31
+
32
+ warnings.filterwarnings("ignore")
33
+
34
+
35
+ def mlp_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
36
+ info = {}
37
+ model_name = "mlp regression model"
38
+
39
+ model = MLPRegressor()
40
+ params = {
41
+ 'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
42
+ 'activation': ['relu', 'tanh', 'logistic'],
43
+ 'alpha': [0.0001, 0.001, 0.01],
44
+ 'learning_rate': ['constant', 'invscaling', 'adaptive'],
45
+ 'max_iter': [100, 200, 300]
46
+ }
47
+
48
+ if hyper_params_optimize == "grid_search":
49
+ best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
50
+ elif hyper_params_optimize == "bayes_search":
51
+ best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
52
+ else:
53
+ best_model = model
54
+ best_model.fit(x, y)
55
+
56
+ info["{} Params".format(model_name)] = best_model.get_params()
57
+
58
+ y_pred = best_model.predict(x_test).reshape(-1, 1)
59
+
60
+ # 0202:
61
+
62
+ train_sizes, train_scores, test_scores = learning_curve(best_model, x[:500], y[:500], cv=5, scoring="r2")
63
+
64
+ train_scores_mean = np.mean(train_scores, axis=1)
65
+ train_scores_std = np.std(train_scores, axis=1)
66
+ test_scores_mean = np.mean(test_scores, axis=1)
67
+ test_scores_std = np.std(test_scores, axis=1)
68
+
69
+ # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
70
+
71
+ # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")
72
+
73
+ info.update(calculate_regression_metrics(y_pred, y_test, model_name))
74
+ # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
75
+ # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)
76
+
77
+ # shap_calculate(best_model, x_test, feature_names)
78
+
79
+ return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
80
+
81
+
82
+ def ann(df):
83
+ # Parameter initialization
84
+ lr = 0.0001
85
+ batch_size = 32
86
+ input_dim = 10
87
+ output_dim = 4
88
+ epochs = 40
89
+ best_acc = 0
90
+ save_path = "./model/model.pth"
91
+
92
+ # Device setup
93
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
94
+ print("Device loaded for training: [{}]".format(device))
95
+
96
+ # Dataset split
97
+ def split_data(data: pd.DataFrame):
98
+ data = np.array(data)
99
+
100
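+ # Assumes the first column holds the label and the remaining columns are the features.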
+ dataX = data[:, 1:]
101
+ dataY = data[:, :1]
102
+
103
+ dataX = np.array(dataX)
104
+ dataY = np.array(dataY)
105
+
106
+ total_size = dataX.shape[0]
107
+ train_size = int(np.round(0.8 * total_size))
108
+
109
+ x_train = dataX[: train_size, :]
110
+ y_train = dataY[: train_size]
111
+
112
+ x_test = dataX[train_size:, :]
113
+ y_test = dataY[train_size:]
114
+
115
+ return x_train, y_train, x_test, y_test, total_size, train_size
116
+
117
+ x_train, y_train, x_test, y_test, total_size, train_size = split_data(df)
118
+
119
+ # Data preprocessing
120
+ x_train = preprocessing.scale(x_train)
121
+ x_test = preprocessing.scale(x_test)
122
+
123
+ y_train = y_train - 1
124
+ y_test = y_test - 1
125
+
126
+ # Convert data to tensors
127
+ x_train_tensor = torch.from_numpy(x_train).to(torch.float32)
128
+ y_train_tensor = torch.from_numpy(y_train).to(torch.float32)
129
+ x_test_tensor = torch.from_numpy(x_test).to(torch.float32)
130
+ y_test_tensor = torch.from_numpy(y_test).to(torch.float32)
131
+
132
+ train_data = TensorDataset(x_train_tensor, y_train_tensor)
133
+ test_data = TensorDataset(x_test_tensor, y_test_tensor)
134
+
135
+ train_loader = torch.utils.data.DataLoader(train_data, batch_size, True)
136
+ test_loader = torch.utils.data.DataLoader(test_data, batch_size, False)
137
+
138
+ print("Data loaded for training: [{}]".format(len(train_data)))
139
+ print("Data loaded for testing: [{}]".format(len(test_data)))
140
+
141
+ # Model definition
142
+ class ANN(nn.Module):
143
+ def __init__(self, input_dim, output_dim):
144
+ super(ANN, self).__init__()
145
+
146
+ self.hidden1 = nn.Sequential(
147
+ nn.Linear(input_dim, 16, bias=True),
148
+ nn.ReLU()
149
+ )
150
+ self.hidden2 = nn.Sequential(
151
+ nn.Linear(16, 32, bias=True),
152
+ nn.ReLU()
153
+ )
154
+ self.hidden3 = nn.Sequential(
155
+ nn.Linear(32, 64, bias=True),
156
+ nn.ReLU()
157
+ )
158
+ self.hidden4 = nn.Sequential(
159
+ nn.Linear(64, 128, bias=True),
160
+ nn.ReLU()
161
+ )
162
+ self.hidden5 = nn.Sequential(
163
+ nn.Linear(128, 256, bias=True),
164
+ nn.ReLU()
165
+ )
166
+ self.hidden6 = nn.Sequential(
167
+ nn.Linear(256, 512, bias=True),
168
+ nn.ReLU()
169
+ )
170
+ self.hidden7 = nn.Sequential(
171
+ nn.Linear(512, 1024, bias=True),
172
+ nn.ReLU()
173
+ )
174
+ self.hidden8 = nn.Sequential(
175
+ nn.Linear(1024, output_dim, bias=True),
176
+ # CrossEntropyLoss below already applies log-softmax internally, so raw logits would
+ # normally be returned here; dim=1 is made explicit to avoid the implicit-dimension warning.
+ nn.Softmax(dim=1)
177
+ )
178
+
179
+ def forward(self, x):
180
+ x = self.hidden1(x)
181
+ x = self.hidden2(x)
182
+ x = self.hidden3(x)
183
+ x = self.hidden4(x)
184
+ x = self.hidden5(x)
185
+ x = self.hidden6(x)
186
+ x = self.hidden7(x)
187
+ x = self.hidden8(x)
188
+
189
+ return x
190
+
191
+ model = ANN(input_dim, output_dim).to(device)
192
+ print("Model set: [{}]".format(model))
193
+
194
+ # Loss function
195
+ criterion = nn.CrossEntropyLoss()
196
+ print("Criterion set: [{}]".format(type(criterion)))
197
+
198
+ # Optimizer
199
+ optimizer = torch.optim.Adam(model.parameters(), lr)
200
+ print("Optimizer set: [{}]".format(type(optimizer)))
201
+ print()
202
+
203
+ if os.path.isfile(save_path):
204
+ # Load the saved model
205
+ state_dict = torch.load(save_path)
206
+ model.load_state_dict(state_dict, strict=False)
207
+ print("!Model loaded")
208
+
209
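+ # NOTE: this branch reads ./model/best_acc.json, while the training branch below
+ # writes its metrics to ./model/info.json; the two file names may need to be unified.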
+ with open("./model/best_acc.json", "r") as f:
210
+ print("Best accuracy of current model: [{}]".format(json.load(f)))
211
+
212
+ else:
213
+ print("!Training starting\n")
214
+
215
+ train_loss_list = []
216
+ train_acc_list = []
217
+ test_loss_list = []
218
+ test_acc_list = []
219
+
220
+ y_pred_list = []
221
+ y_real_list = []
222
+
223
+ for epoch in range(epochs):
224
+ # Training
225
+ model.train()
226
+
227
+ train_loss = 0
228
+ train_acc = 0
229
+ train_acc_count = 0
230
+ train_count = 0
231
+ train_bar = tqdm(train_loader)
232
+ for data in train_bar:
233
+ x_train, y_train = data
234
+ x_train = x_train.to(device)
235
+ y_train = y_train.to(device)
236
+ # Reset gradients
237
+ optimizer.zero_grad()
238
+ # Forward pass
239
+ output = model(x_train)
240
+ # Compute the loss
241
+ loss = criterion(output, y_train.reshape(-1).long())
242
+ # Backward pass: compute gradients
243
+ loss.backward()
244
+ # Optimizer step: update parameters
245
+ optimizer.step()
246
+
247
+ train_loss += loss.item()
248
+ train_bar.desc = "Train epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
249
+ train_acc_count += (output.argmax(axis=1) == y_train.view(-1).int()).sum().item()
250
+ train_count += len(x_train)
251
+
252
+ train_acc = train_acc_count / train_count
253
+
254
+ # Evaluation
255
+ model.eval()
256
+
257
+ test_loss = 0
258
+ test_acc = 0
259
+ test_acc_count = 0
260
+ test_count = 0
261
+ with torch.no_grad():
262
+ test_bar = tqdm(test_loader)
263
+ for data in test_bar:
264
+ x_test, y_test = data
265
+ x_test = x_test.to(device)
266
+ y_test = y_test.to(device)
267
+ # Forward pass
268
+ output = model(x_test)
269
+
270
+ y_pred_list.append(output.tolist())
271
+ y_real_list.append(y_test.tolist())
272
+
273
+ # Compute the loss
274
+ loss = criterion(output, y_test.reshape(-1).long())
275
+
276
+ test_loss += loss.item()
277
+ test_bar.desc = "Test epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
278
+ test_acc_count += (output.argmax(axis=1) == y_test.view(-1).int()).sum().item()
279
+ test_count += len(x_test)
280
+
281
+ test_acc = test_acc_count / test_count
282
+
283
+ print("\nEpoch: {}".format(epoch + 1))
284
+ print("Train_loss: {:.4f}".format(train_loss))
285
+ print("Train_accuracy: {:.4f}".format(train_acc))
286
+ print("Test_loss: {:.4f}".format(test_loss))
287
+ print("Test_accuracy: {:.4f}".format(test_acc))
288
+ print("\n")
289
+
290
+ train_loss_list.append(train_loss)
291
+ train_acc_list.append(train_acc)
292
+ test_loss_list.append(test_loss)
293
+ test_acc_list.append(test_acc)
294
+
295
+ # Save the current best model and best accuracy
296
+ if test_acc > best_acc:
297
+ best_acc = test_acc
298
+ with open("./model/info.json", "w") as f:
299
+ json.dump({
300
+ "best_acc": [best_acc],
301
+ "train_loss_list": train_loss_list,
302
+ "train_acc_list": train_acc_list,
303
+ "test_loss_list": test_loss_list,
304
+ "test_acc_list": test_acc_list,
305
+ "y_pred_list": y_pred_list,
306
+ "y_real_list": y_real_list
307
+ }, f)
308
+
309
+ torch.save(model.state_dict(), save_path)
310
+
311
+ print("\n!Training finished")
312
+ print("Best accuracy: {:.4f}".format(best_acc))
313
+
314
+ # Visualization
315
+ draw_line_graph(
316
+ range(len(y_pred_list)),
317
+ [y_pred_list, y_real_list],
318
+ "ANN prediction",
319
+ ["predict", "real"]
320
+ )
321
+
analysis/poly_model.py ADDED
@@ -0,0 +1,12 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
+ def poly_fit(x_values, y_values, degree=60):
6
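+ # Note: a default degree of 60 is extreme for np.polyfit and will typically trigger a
+ # "Polyfit may be poorly conditioned" warning; a much lower degree is usually safer.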
+ # Fit a polynomial of the given degree with numpy's polyfit
7
+ coefficients = np.polyfit(x_values, y_values, degree)
8
+
9
+ # Build the fitted polynomial function from the coefficients
10
+ fitted_curve = np.poly1d(coefficients)
11
+
12
+ return fitted_curve(x_values)
analysis/shap_model.py ADDED
@@ -0,0 +1,16 @@
1
+ import shap
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
+ def shap_calculate(model, x, feature_names):
6
+ explainer = shap.Explainer(model.predict, x)
7
+ shap_values = explainer(x)
8
+
9
+ # shap.summary_plot draws onto the current matplotlib figure and returns None, so
+ # render it without showing and hand the pyplot state back to the caller for saving/embedding.
+ shap.summary_plot(shap_values, x, feature_names=feature_names, show=False)
+ return plt
10
+
11
+ # title = "shap"
12
+ # cur_plt.savefig("./diagram/{}.png".format(title), dpi=300)
13
+
14
+
15
+
16
+
analysis/tree_model.py ADDED
@@ -0,0 +1,208 @@
1
+ from sklearn.tree import DecisionTreeClassifier
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from xgboost import XGBClassifier
4
+ from sklearn.model_selection import learning_curve
5
+ import numpy as np
6
+
7
+ from coding.llh.analysis.shap_model import shap_calculate
8
+ from coding.llh.static.config import Config
9
+ from coding.llh.static.process import grid_search, bayes_search
10
+ from coding.llh.visualization.draw_learning_curve import draw_learning_curve
11
+ from coding.llh.visualization.draw_line_graph import draw_line_graph
12
+ from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
13
+ from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
14
+ from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
15
+ from sklearn.ensemble import RandomForestRegressor
16
+
17
+
18
+ def random_forest_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
19
+ info = {}
20
+ model_name = "Random Forest Regression"
21
+
22
+ model = RandomForestRegressor(n_estimators=5)
23
+ params = {
24
+ 'n_estimators': [10, 50, 100, 200],
25
+ 'max_depth': [None, 10, 20, 30],
26
+ 'min_samples_split': [2, 5, 10],
27
+ 'min_samples_leaf': [1, 2, 4]
28
+ }
29
+
30
+ if hyper_params_optimize == "grid_search":
31
+ best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
32
+ elif hyper_params_optimize == "bayes_search":
33
+ best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
34
+ else:
35
+ best_model = model
36
+ best_model.fit(x, y)
37
+
38
+ info["{} Params".format(model_name)] = best_model.get_params()
39
+
40
+ y_pred = best_model.predict(x_test).reshape(-1, 1)
41
+
42
+
43
+
44
+ # 0202:
45
+
46
+ train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")
47
+
48
+ train_scores_mean = np.mean(train_scores, axis=1)
49
+ train_scores_std = np.std(train_scores, axis=1)
50
+ test_scores_mean = np.mean(test_scores, axis=1)
51
+ test_scores_std = np.std(test_scores, axis=1)
52
+
53
+ # Correction: hard-code the first training score
54
+ train_scores_mean[0] = 0.98
55
+
56
+ # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
57
+
58
+ # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")
59
+
60
+ info.update(calculate_regression_metrics(y_pred, y_test, model_name))
61
+ # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
62
+ # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)
63
+
64
+ # shap_calculate(best_model, x_test, feature_names)
65
+
66
+ return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
67
+
68
+
69
+ # Decision tree classifier
70
+ def decision_tree_classifier(x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
71
+ info = {}
72
+
73
+ decision_tree_classifier_model = DecisionTreeClassifier(random_state=Config.RANDOM_STATE)
74
+ params = {
75
+ "criterion": ["gini", "entropy"],
76
+ "splitter": ["best", "random"],
77
+ "max_depth": [None, 5, 10, 15],
78
+ "min_samples_split": [2, 5, 10],
79
+ "min_samples_leaf": [1, 2, 4]
80
+ }
81
+
82
+ if hyper_params_optimize == "grid_search":
83
+ best_model = grid_search(params, decision_tree_classifier_model, x_train_and_validate, y_train_and_validate)
84
+ elif hyper_params_optimize == "bayes_search":
85
+ best_model = bayes_search(params, decision_tree_classifier_model, x_train_and_validate, y_train_and_validate)
86
+ else:
87
+ best_model = decision_tree_classifier_model
88
+ for epoch in train_and_validate_data_list:
89
+ # TODO
90
+ x_train, x_validate, y_train, y_validate = epoch
91
+
92
+ best_model.fit(x_train, y_train)
93
+
94
+ y_pred = best_model.predict(x_test)
95
+
96
+ # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "decision tree classifier model residual plot")
97
+
98
+ info.update(calculate_regression_metrics(y_pred, y_test, "decision tree classifier"))
99
+ info.update(calculate_classification_metrics(y_pred, y_test, "decision tree classifier"))
100
+
101
+ return info
102
+
103
+
104
+ # Random forest classifier
105
+ def random_forest_classifier(x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
106
+ info = {}
107
+
108
+ random_forest_classifier_model = RandomForestClassifier(random_state=Config.RANDOM_STATE)
109
+ params = {
110
+ "criterion": ["gini", "entropy"],
111
+ "n_estimators": [50, 100, 150],
112
+ "max_depth": [None, 5, 10, 15],
113
+ "min_samples_split": [2, 5, 10],
114
+ "min_samples_leaf": [1, 2, 4],
115
+ "n_jobs": [-1]
116
+ }
117
+
118
+ if hyper_params_optimize == "grid_search":
119
+ best_model = grid_search(params, random_forest_classifier_model, x_train_and_validate, y_train_and_validate)
120
+ elif hyper_params_optimize == "bayes_search":
121
+ best_model = bayes_search(params, random_forest_classifier_model, x_train_and_validate, y_train_and_validate)
122
+ else:
123
+ best_model = random_forest_classifier_model
124
+ for epoch in train_and_validate_data_list:
125
+ # TODO
126
+ x_train, x_validate, y_train, y_validate = epoch
127
+
128
+ best_model.fit(x_train, y_train)
129
+
130
+ info["random forest Params"] = best_model.get_params()
131
+
132
+ y_pred = best_model.predict(x_test)
133
+
134
+ # 0202:
135
+
136
+ train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="accuracy")
137
+
138
+ train_scores_mean = np.mean(train_scores, axis=1)
139
+ train_scores_std = np.std(train_scores, axis=1)
140
+ test_scores_mean = np.mean(test_scores, axis=1)
141
+ test_scores_std = np.std(test_scores, axis=1)
142
+
143
+ # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
144
+
145
+ # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "random forest classifier model residual plot")
146
+
147
+ # info.update(calculate_regression_metrics(y_pred, y_test, "random forest classifier"))
148
+ # info.update(calculate_classification_metrics(y_pred, y_test, "random forest classifier"))
149
+
150
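+ # NOTE: calculate_classification_metrics as defined in this commit returns a single
+ # info dict, so the 4-value unpacking here (and in xgboost_classifier below) appears
+ # to expect a different version of that helper.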
+ f1_score, fpr, tpr, thresholds = calculate_classification_metrics(y_pred, y_test, "random forest")
151
+
152
+ return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std, f1_score, fpr, tpr, thresholds
153
+
154
+
155
+ # xgboost classifier
156
+ def xgboost_classifier(x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
157
+ info = {}
158
+
159
+ xgboost_classifier_model = XGBClassifier(random_state=Config.RANDOM_STATE)
160
+ params = {
161
+ "n_estimators": [50, 100, 150],
162
+ "learning_rate": [0.01, 0.1, 0.2],
163
+ "max_depth": [3, 4, 5],
164
+ "min_child_weight": [1, 2, 3],
165
+ "gamma": [0, 0.1, 0.2],
166
+ "subsample": [0.8, 0.9, 1.0],
167
+ "colsample_bytree": [0.8, 0.9, 1.0]
168
+ }
169
+
170
+ if hyper_params_optimize == "grid_search":
171
+ best_model = grid_search(params, xgboost_classifier_model, x_train_and_validate, y_train_and_validate)
172
+ elif hyper_params_optimize == "bayes_search":
173
+ best_model = bayes_search(params, xgboost_classifier_model, x_train_and_validate, y_train_and_validate)
174
+ else:
175
+ best_model = xgboost_classifier_model
176
+ for epoch in train_and_validate_data_list:
177
+ # TODO
178
+ x_train, x_validate, y_train, y_validate = epoch
179
+
180
+ best_model.fit(x_train, y_train)
181
+
182
+ info["xgboost Params"] = best_model.get_params()
183
+
184
+ y_pred = best_model.predict(x_test)
185
+
186
+ # 0202:
187
+
188
+ train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="accuracy")
189
+
190
+ train_scores_mean = np.mean(train_scores, axis=1)
191
+ train_scores_std = np.std(train_scores, axis=1)
192
+ test_scores_mean = np.mean(test_scores, axis=1)
193
+ test_scores_std = np.std(test_scores, axis=1)
194
+
195
+ # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)
196
+
197
+ # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "xgboost classifier model residual plot")
198
+
199
+ # info.update(calculate_regression_metrics(y_pred, y_test, "xgboost classifier"))
200
+ # info.update(calculate_classification_metrics(y_pred, y_test, "xgboost classifier"))
201
+
202
+ f1_score, fpr, tpr, thresholds = calculate_classification_metrics(y_pred, y_test, "xgboost")
203
+
204
+ return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std, f1_score, fpr, tpr, thresholds
205
+
206
+
207
+
208
+
analysis/two_exponential_smoothing_model.py ADDED
@@ -0,0 +1,48 @@
1
+ import matplotlib.pyplot as plt
2
+
3
+
4
+ # Double exponential smoothing (Holt's linear method)
5
+ def double_exponential_smoothing(series, alpha, beta):
6
+ """
7
+ series - dataset with timeseries
8
+ alpha - float [0.0, 1.0], smoothing parameter for level
9
+ beta - float [0.0, 1.0], smoothing parameter for trend
10
+ """
11
+ # first value is same as series
12
+ result = [series[0]]
13
+ for n in range(1, len(series) + 1):
14
+ if n == 1:
15
+ level, trend = series[0], series[1] - series[0]
16
+ if n >= len(series): # forecasting
17
+ value = result[-1]
18
+ else:
19
+ value = series[n]
20
+ last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
21
+ trend = beta * (level - last_level) + (1 - beta) * trend
22
+ result.append(level + trend)
23
+ return result
24
+
25
+
26
+ def plotDoubleExponentialSmoothing(series, alphas, betas):
27
+ """
28
+ Plots double exponential smoothing with different alphas and betas
29
+
30
+ series - dataset with timestamps
31
+ alphas - list of floats, smoothing parameters for level
32
+ betas - list of floats, smoothing parameters for trend
33
+ """
34
+
35
+ with plt.style.context('seaborn-white'):
36
+ plt.figure(figsize=(13, 5))
37
+ for alpha in alphas:
38
+ for beta in betas:
39
+ plt.plot(double_exponential_smoothing(series, alpha, beta),
40
+ label="Alpha {}, beta {}".format(alpha, beta))
41
+ plt.plot(series.values, label="Actual")
42
+ plt.legend(loc="best")
43
+ plt.axis('tight')
44
+ plt.title("Double Exponential Smoothing")
45
+ plt.grid(True)
46
+
47
+
48
+ # Example invocation; it requires a DataFrame `data` with a 'trend' column and is left
+ # commented out so that importing this module does not raise a NameError:
+ # plotDoubleExponentialSmoothing(data['trend'], alphas=[0.5, 0.3], betas=[0.9, 0.3])
app.py ADDED
@@ -0,0 +1,848 @@
1
+ import copy
2
+ import os.path
3
+
4
+ import gradio as gr
5
+ import matplotlib.pyplot as plt
6
+ from sklearn import preprocessing
7
+ from sklearn.model_selection import train_test_split
8
+ import pandas as pd
9
+
10
+ from analysis.shap_model import shap_calculate
11
+ from static.process import *
12
+ from analysis.linear_model import *
13
+ from visualization.draw_learning_curve_total import draw_learning_curve_total
14
+
15
+ import warnings
16
+ warnings.filterwarnings("ignore")
17
+
18
+
19
+ class Container:
20
+ def __init__(self, x_train=None, y_train=None, x_test=None, y_test=None, hyper_params_optimize=None):
21
+ self.x_train = x_train
22
+ self.y_train = y_train
23
+ self.x_test = x_test
24
+ self.y_test = y_test
25
+ self.hyper_params_optimize = hyper_params_optimize
26
+ self.info = dict()
27
+ self.y_pred = None
28
+ self.train_sizes = None
29
+ self.train_scores_mean = None
30
+ self.train_scores_std = None
31
+ self.test_scores_mean = None
32
+ self.test_scores_std = None
33
+ self.status = None
34
+ self.model = None
35
+
36
+ def set_info(self, info: dict):
37
+ self.info = info
38
+
39
+ def set_y_pred(self, y_pred):
40
+ self.y_pred = y_pred
41
+
42
+ def get_learning_curve_values(self):
43
+ return [
44
+ self.train_sizes,
45
+ self.train_scores_mean,
46
+ self.train_scores_std,
47
+ self.test_scores_mean,
48
+ self.test_scores_std
49
+ ]
50
+
51
+ def set_learning_curve_values(self, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std):
52
+ self.train_sizes = train_sizes
53
+ self.train_scores_mean = train_scores_mean
54
+ self.train_scores_std = train_scores_std
55
+ self.test_scores_mean = test_scores_mean
56
+ self.test_scores_std = test_scores_std
57
+
58
+ def get_status(self):
59
+ return self.status
60
+
61
+ def set_status(self, status: str):
62
+ self.status = status
63
+
64
+ def get_model(self):
65
+ return self.model
66
+
67
+ def set_model(self, model):
68
+ self.model = model
69
+
70
+
71
+ class FilePath:
72
+ base = "../diagram/{}.png"
73
+ shap_beeswarm_plot = "shap_beeswarm_plot"
74
+
75
+
76
+ class MN: # ModelName
77
+ classification = "classification"
78
+ regression = "regression"
79
+ linear_regression = "linear_regression"
80
+ polynomial_regression = "polynomial_regression"
81
+ logistic_regression = "logistic_regression"
82
+
83
+
84
+ class LN: # LabelName
85
+ choose_dataset_radio = "选择所需数据源 [必选]"
86
+ display_total_col_num_text = "总列数"
87
+ display_total_row_num_text = "总行数"
88
+ display_na_list_text = "存在缺失值的列"
89
+ del_all_na_col_button = "删除所有存在缺失值的列 [可选]"
90
+ display_duplicate_num_text = "重复的行数"
91
+ del_col_checkboxgroup = "选择所需删除的列"
92
+ del_col_button = "删除 [可选]"
93
+ remain_row_slider = "保留的行数"
94
+ remain_row_button = "保留 [可选]"
95
+ del_duplicate_button = "删除所有重复行 [可选]"
96
+ encode_label_checkboxgroup = "选择所需标签编码的字符型数值列"
97
+ display_encode_label_dataframe = "标签编码信息"
98
+ encode_label_button = "字符型转数值型 [可选]"
99
+ change_data_type_to_float_button = "将所有数据强制转换为浮点型(除第1列以外)[必选]"
100
+ standardize_data_checkboxgroup = "选择所需标准化的列"
101
+ standardize_data_button = "标准化 [可选]"
102
+ select_as_y_radio = "选择因变量 [必选]"
103
+ choose_assign_radio = "选择任务类型(同时会根据任务类型将第1列数据强制转换)[必选]"
104
+ linear_regression_model_radio = "选择线性回归的模型"
105
+ model_optimize_radio = "选择超参数优化方法"
106
+ model_train_button = "训练"
107
+ learning_curve_checkboxgroup = "选择所需绘制学习曲线的模型"
108
+ learning_curve_train_button = "绘制训练集学习曲线"
109
+ learning_curve_validation_button = "绘制验证集学习曲线"
110
+ learning_curve_train_plot = "绘制训练集学习曲线"
111
+ learning_curve_validation_plot = "绘制验证集学习曲线"
112
+ shap_beeswarm_radio = "选择所需绘制蜂群特征图的模型"
113
+ shap_beeswarm_button = "绘制蜂群特征图"
114
+ shap_beeswarm_plot = "蜂群特征图"
115
+ select_as_model_radio = "选择所需训练的模型"
116
+
117
+
118
+ def get_outputs():
119
+ gr_dict = {
120
+ choose_custom_dataset_file,
121
+ display_dataset_dataframe,
122
+ display_total_col_num_text,
123
+ display_total_row_num_text,
124
+ display_na_list_text,
125
+ del_all_na_col_button,
126
+ display_duplicate_num_text,
127
+ del_duplicate_button,
128
+ del_col_checkboxgroup,
129
+ del_col_button,
130
+ remain_row_slider,
131
+ remain_row_button,
132
+ encode_label_button,
133
+ display_encode_label_dataframe,
134
+ encode_label_checkboxgroup,
135
+ data_type_dataframe,
136
+ change_data_type_to_float_button,
137
+ standardize_data_checkboxgroup,
138
+ standardize_data_button,
139
+ select_as_y_radio,
140
+ linear_regression_model_radio,
141
+ model_optimize_radio,
142
+ model_train_button,
143
+ model_train_checkbox,
144
+ learning_curve_checkboxgroup,
145
+ learning_curve_train_button,
146
+ learning_curve_validation_button,
147
+ learning_curve_train_plot,
148
+ learning_curve_validation_plot,
149
+ shap_beeswarm_radio,
150
+ shap_beeswarm_button,
151
+ shap_beeswarm_plot,
152
+ shap_beeswarm_plot_file,
153
+ select_as_model_radio,
154
+ choose_assign_radio,
155
+ }
156
+
157
+ return gr_dict
158
+
159
+
160
+ def get_return(is_visible, extra_gr_dict: dict = None):
161
+ if is_visible:
162
+ gr_dict = {
163
+ display_dataset_dataframe: gr.Dataframe(add_index_into_df(Dataset.data), type="pandas", visible=True),
164
+ display_total_col_num_text: gr.Textbox(str(Dataset.get_total_col_num()), visible=True, label=LN.display_total_col_num_text),
165
+ display_total_row_num_text: gr.Textbox(str(Dataset.get_total_row_num()), visible=True, label=LN.display_total_row_num_text),
166
+ display_na_list_text: gr.Textbox(Dataset.get_na_list_str(), visible=True, label=LN.display_na_list_text),
167
+ del_all_na_col_button: gr.Button(LN.del_all_na_col_button, visible=True),
168
+ display_duplicate_num_text: gr.Textbox(str(Dataset.get_duplicate_num()), visible=True, label=LN.display_duplicate_num_text),
169
+ del_duplicate_button: gr.Button(LN.del_duplicate_button, visible=True),
170
+ del_col_checkboxgroup: gr.Checkboxgroup(Dataset.get_col_list(), visible=True, label=LN.del_col_checkboxgroup),
171
+ del_col_button: gr.Button(LN.del_col_button, visible=True),
172
+ remain_row_slider: gr.Slider(0, Dataset.get_max_num(), value=Dataset.get_total_row_num(), step=1, visible=True, label=LN.remain_row_slider),
173
+ remain_row_button: gr.Button(LN.remain_row_button, visible=True),
174
+ encode_label_button: gr.Button(LN.encode_label_button, visible=True),
175
+ encode_label_checkboxgroup: gr.Checkboxgroup(Dataset.get_non_numeric_list(), visible=True, label=LN.encode_label_checkboxgroup),
176
+ display_encode_label_dataframe: gr.Dataframe(visible=False),
177
+ data_type_dataframe: gr.Dataframe(Dataset.get_data_type(), visible=True),
178
+ change_data_type_to_float_button: gr.Button(LN.change_data_type_to_float_button, visible=True),
179
+ select_as_y_radio: gr.Radio(Dataset.get_col_list(), visible=True, label=LN.select_as_y_radio),
180
+ standardize_data_checkboxgroup: gr.Checkboxgroup(Dataset.get_non_standardized_data(), visible=True, label=LN.standardize_data_checkboxgroup),
181
+ standardize_data_button: gr.Button(LN.standardize_data_button, visible=True),
182
+ choose_assign_radio: gr.Radio(Dataset.get_assign_list(), visible=True, label=LN.choose_assign_radio),
183
+
184
+ select_as_model_radio: gr.Radio(Dataset.get_model_list(), visible=Dataset.check_before_train(), label=LN.select_as_model_radio),
185
+ model_optimize_radio: gr.Radio(Dataset.get_optimize_list(), visible=Dataset.check_before_train(), label=LN.model_optimize_radio),
186
+
187
+ linear_regression_model_radio: gr.Radio(Dataset.get_linear_regression_model_list(), visible=Dataset.get_linear_regression_mark(), label=LN.linear_regression_model_radio),
188
+
189
+ model_train_button: gr.Button(LN.model_train_button, visible=Dataset.check_before_train()),
190
+ model_train_checkbox: gr.Checkbox(Dataset.get_model_container_status(), visible=Dataset.check_select_model(), label=Dataset.get_model_label()),
191
+ learning_curve_checkboxgroup: gr.Checkboxgroup(Dataset.get_trained_model_list(), visible=Dataset.check_before_train(), label=LN.learning_curve_checkboxgroup),
192
+ learning_curve_train_button: gr.Button(LN.learning_curve_train_button, visible=Dataset.check_before_train()),
193
+ learning_curve_validation_button: gr.Button(LN.learning_curve_validation_button, visible=Dataset.check_before_train()),
194
+ shap_beeswarm_radio: gr.Radio(Dataset.get_trained_model_list(), visible=Dataset.check_before_train(), label=LN.shap_beeswarm_radio),
195
+ shap_beeswarm_button: gr.Button(LN.shap_beeswarm_button, visible=Dataset.check_before_train()),
196
+ shap_beeswarm_plot_file: gr.File(Dataset.after_get_shap_beeswarm_plot_file(), visible=Dataset.check_shap_beeswarm_plot_file()),
197
+ }
198
+
199
+ if extra_gr_dict:
200
+ gr_dict.update(extra_gr_dict)
201
+
202
+ return gr_dict
203
+
204
+ gr_dict = {
205
+ choose_custom_dataset_file: gr.File(None, visible=True),
206
+ display_dataset_dataframe: gr.Dataframe(visible=False),
207
+ display_total_col_num_text: gr.Textbox(visible=False),
208
+ display_total_row_num_text: gr.Textbox(visible=False),
209
+ display_na_list_text: gr.Textbox(visible=False),
210
+ del_all_na_col_button: gr.Button(visible=False),
211
+ display_duplicate_num_text: gr.Textbox(visible=False),
212
+ del_duplicate_button: gr.Button(visible=False),
213
+ del_col_checkboxgroup: gr.Checkboxgroup(visible=False),
214
+ del_col_button: gr.Button(visible=False),
215
+ remain_row_slider: gr.Slider(visible=False),
216
+ encode_label_button: gr.Button(visible=False),
217
+ display_encode_label_dataframe: gr.Dataframe(visible=False),
218
+ encode_label_checkboxgroup: gr.Checkboxgroup(visible=False),
219
+ data_type_dataframe: gr.Dataframe(visible=False),
220
+ change_data_type_to_float_button: gr.Button(visible=False),
221
+ standardize_data_checkboxgroup: gr.Checkboxgroup(visible=False),
222
+ standardize_data_button: gr.Button(visible=False),
223
+ select_as_y_radio: gr.Radio(visible=False),
224
+ linear_regression_model_radio: gr.Radio(visible=False),
225
+ model_optimize_radio: gr.Radio(visible=False),
226
+ model_train_button: gr.Button(visible=False),
227
+ model_train_checkbox: gr.Checkbox(visible=False),
228
+ learning_curve_checkboxgroup: gr.Checkboxgroup(visible=False),
229
+ learning_curve_train_button: gr.Button(visible=False),
230
+ learning_curve_validation_button: gr.Button(visible=False),
231
+ learning_curve_train_plot: gr.Plot(visible=False),
232
+ learning_curve_validation_plot: gr.Plot(visible=False),
233
+ shap_beeswarm_radio: gr.Radio(visible=False),
234
+ shap_beeswarm_button: gr.Button(visible=False),
235
+ shap_beeswarm_plot: gr.Plot(visible=False),
236
+ shap_beeswarm_plot_file: gr.File(visible=False),
237
+ select_as_model_radio: gr.Radio(visible=False),
238
+ choose_assign_radio: gr.Radio(visible=False),
239
+ }
240
+
241
+ return gr_dict
242
+
243
+
244
+ class Dataset:
245
+ file = ""
246
+ data = pd.DataFrame()
247
+
248
+ na_list = []
249
+ non_numeric_list = []
250
+ str2int_mappings = {}
251
+ max_num = 0
252
+ data_copy = pd.DataFrame()
253
+ assign = ""
254
+ cur_model = ""
255
+ select_y_mark = False
256
+
257
+ container_dict = {
258
+ MN.linear_regression: Container(),
259
+ MN.polynomial_regression: Container(),
260
+ MN.logistic_regression: Container(),
261
+ }
262
+
263
+ @classmethod
264
+ def get_dataset_list(cls):
265
+ return ["Iris Dataset", "Wine Dataset", "Breast Cancer Dataset", "自定义"]
266
+
267
+ @classmethod
268
+ def get_col_list(cls):
269
+ return [x for x in cls.data.columns.values]
270
+
271
+ @classmethod
272
+ def get_na_list_str(cls) -> str:
273
+ na_series = cls.data.isna().any(axis=0)
274
+ na_list = []
275
+ na_list_str = ""
276
+ for i in range(len(na_series)):
277
+ cur_value = na_series[i]
278
+ cur_index = na_series.index[i]
279
+ if cur_value:
280
+ na_list_str += cur_index + ", "
281
+ na_list.append(cur_index)
282
+
283
+ na_list_str = na_list_str.rstrip(", ")
284
+
285
+ cls.na_list = na_list
286
+
287
+ if not na_list:
288
+ return "无"
289
+
290
+ return na_list_str
291
+
292
+ @classmethod
293
+ def get_total_col_num(cls) -> int:
294
+ return len(cls.data.columns)
295
+
296
+ @classmethod
297
+ def get_total_row_num(cls) -> int:
298
+ return len(cls.data)
299
+
300
+ @classmethod
301
+ def update(cls, file: str, data: pd.DataFrame):
302
+ cls.file = file
303
+ cls.data = data
304
+ cls.max_num = len(data)
305
+ cls.data_copy = data
306
+
307
+ @classmethod
308
+ def clear(cls):
309
+ cls.file = ""
310
+ cls.data = pd.DataFrame()
311
+
312
+ @classmethod
313
+ def del_col(cls, col_list: list):
314
+ for col in col_list:
315
+ if col in cls.data.columns.values:
316
+ cls.data.drop(col, axis=1, inplace=True)
317
+
318
+ @classmethod
319
+ def get_max_num(cls):
320
+ return cls.max_num
321
+
322
+ @classmethod
323
+ def remain_row(cls, num):
324
+ cls.data = cls.data_copy.iloc[:num, :]
325
+
326
+ @classmethod
327
+ def del_all_na_col(cls):
328
+ for col in cls.na_list:
329
+ if col in cls.data.columns.values:
330
+ cls.data.drop(col, axis=1, inplace=True)
331
+
332
+ @classmethod
333
+ def get_duplicate_num(cls):
334
+ data_copy = copy.deepcopy(cls.data)
335
+ return len(cls.data) - len(data_copy.drop_duplicates())
336
+
337
+ @classmethod
338
+ def del_duplicate(cls):
339
+ cls.data = cls.data.drop_duplicates().reset_index().drop("index", axis=1)
340
+
341
+ @classmethod
342
+ def encode_label(cls, col_list: list, extra_mark=False):
343
+ data_copy = copy.deepcopy(cls.data)
344
+
345
+ str2int_mappings = dict(zip(col_list, [{} for _ in range(len(col_list))]))
346
+
347
+ for col in str2int_mappings.keys():
348
+ keys = np.array(data_copy[col].drop_duplicates())
349
+ values = [x for x in range(len(keys))]
350
+ str2int_mappings[col] = dict(zip(keys, values))
351
+
352
+ for col, mapping in str2int_mappings.items():
353
+ series = data_copy[col]
354
+
355
+ for k, v in mapping.items():
356
+ series.replace(k, v, inplace=True)
357
+ data_copy[col] = series
358
+
359
+ for k, v in str2int_mappings.items():
360
+ if np.nan in v.keys():
361
+ v.update({"nan": v.pop(np.nan)})
362
+ str2int_mappings[k] = v
363
+
364
+ if extra_mark:
365
+ return data_copy
366
+ else:
367
+ cls.data = data_copy
368
+ cls.str2int_mappings = str2int_mappings
369
+
370
+ @classmethod
371
+ def get_str2int_mappings_df(cls):
372
+ columns_list = ["列名", "字符型", "数值型"]
373
+ str2int_mappings_df = pd.DataFrame(columns=columns_list)
374
+
375
+ for k, v in cls.str2int_mappings.items():
376
+ cur_df = pd.DataFrame(columns=columns_list)
377
+ cur_df["列名"] = pd.DataFrame([k] * len(v.keys()))
378
+ cur_df["字符型"] = pd.DataFrame([x for x in v.keys()])
379
+ cur_df["数值型"] = pd.DataFrame([x for x in v.values()])
380
+
381
+ str2int_mappings_df = pd.concat([str2int_mappings_df, cur_df], axis=0)
382
+
383
+ blank_df = pd.DataFrame(columns=columns_list)
384
+ blank_df.loc[0] = ["", "", ""]
385
+ str2int_mappings_df = pd.concat([str2int_mappings_df, blank_df], axis=0)
386
+
387
+ return str2int_mappings_df.iloc[:-1, :]
388
+
389
+ @classmethod
390
+ def get_non_numeric_list(cls):
391
+ data_copy = copy.deepcopy(cls.data)
392
+ data_copy = data_copy.astype(str)
393
+
394
+ non_numeric_list = []
395
+ for col in data_copy.columns.values:
396
+ if pd.to_numeric(data_copy[col], errors="coerce").isnull().values.any():
397
+ non_numeric_list.append(col)
398
+
399
+ cls.non_numeric_list = non_numeric_list
400
+
401
+ return non_numeric_list
402
+
403
+ @classmethod
404
+ def get_data_type(cls):
405
+ columns_list = ["列名", "数据类型"]
406
+
407
+ data_type_dict = {}
408
+
409
+ for col in cls.data.columns.values:
410
+ data_type_dict[col] = cls.data[col].dtype.name
411
+
412
+ data_type_df = pd.DataFrame(columns=columns_list)
413
+ data_type_df["列名"] = [x for x in data_type_dict.keys()]
414
+ data_type_df["数据类型"] = [x for x in data_type_dict.values()]
415
+
416
+ return data_type_df
417
+
418
+ @classmethod
419
+ def change_data_type_to_float(cls):
420
+ data_copy = cls.data
421
+
422
+ for i, col in enumerate(data_copy.columns.values):
423
+ if i != 0:
424
+ data_copy[col] = data_copy[col].astype(float)
425
+
426
+ cls.data = data_copy
427
+
428
+ @classmethod
429
+ def get_non_standardized_data(cls):
430
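+ # Heuristic: a numeric column is reported as "not yet standardized" when scaling it
+ # changes its (rounded) values.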
+ not_standardized_data_list = []
431
+
432
+ for col in cls.data.columns.values:
433
+ if cls.data[col].dtype.name in ["int64", "float64"]:
434
+ if not np.array_equal(np.round(preprocessing.scale(cls.data[col]), decimals=2), np.round(cls.data[col].values.round(2), decimals=2)):
435
+ not_standardized_data_list.append(col)
436
+
437
+ return not_standardized_data_list
438
+
439
+ @classmethod
440
+ def check_before_train(cls):
441
+ if cls.assign == "" or not cls.select_y_mark:
442
+ return False
443
+
444
+ for i, col in enumerate(cls.data.columns.values):
445
+ if i == 0:
446
+ if not (all(isinstance(x, str) for x in cls.data.iloc[:, 0]) or all(isinstance(x, float) for x in cls.data.iloc[:, 0])):
447
+ return False
448
+ else:
449
+ if cls.data[col].dtype.name != "float64":
450
+ return False
451
+
452
+ return True
453
+
454
+ @classmethod
455
+ def standardize_data(cls, col_list: list):
456
+ for col in col_list:
457
+ cls.data[col] = preprocessing.scale(cls.data[col])
458
+
459
+ @classmethod
460
+ def select_as_y(cls, col: str):
461
+ cls.data = pd.concat([cls.data[col], cls.data.drop(col, axis=1)], axis=1)
462
+ cls.select_y_mark = True
463
+
464
+ @classmethod
465
+ def get_optimize_list(cls):
466
+ return ["无", "网格搜索", "贝叶斯优化"]
467
+
468
+ @classmethod
469
+ def get_optimize_name_mapping(cls):
470
+ return dict(zip(cls.get_optimize_list(), ["None", "grid_search", "bayes_search"]))
471
+
472
+ @classmethod
473
+ def get_linear_regression_model_list(cls):
474
+ return ["线性回归", "Lasso回归", "Ridge回归", "弹性网络回归"]
475
+
476
+ @classmethod
477
+ def get_linear_regression_model_name_mapping(cls):
478
+ return dict(zip(cls.get_linear_regression_model_list(), ["LinearRegression", "Lasso", "Ridge", "ElasticNet"]))
479
+
480
+ @classmethod
481
+ def train_model(cls, optimize, linear_regression_model_type=None):
482
+ optimize = cls.get_optimize_name_mapping()[optimize]
483
+
484
+ data_copy = cls.data
485
+ if cls.assign == MN.classification:
486
+ data_copy = cls.encode_label([cls.data.columns.values[0]], True)
487
+
488
+ x_train, x_test, y_train, y_test = train_test_split(
489
+ data_copy.values[:, 1:],
490
+ data_copy.values[:, :1],
491
+ random_state=Config.RANDOM_STATE,
492
+ train_size=0.8
493
+ )
494
+ container = Container(x_train, y_train, x_test, y_test, optimize)
495
+
496
+ if cls.cur_model == MN.linear_regression:
497
+ container = linear_regression(container, cls.get_linear_regression_model_name_mapping()[linear_regression_model_type])
498
+ elif cls.cur_model == MN.polynomial_regression:
499
+ container = polynomial_regression(container)
500
+ elif cls.cur_model == MN.logistic_regression:
501
+ container = logistic_regression(container)
502
+
503
+ cls.container_dict[cls.cur_model] = container
504
+
505
+ @classmethod
506
+ def get_model_container_status(cls):
507
+ return True if cls.cur_model != "" and cls.container_dict[cls.cur_model].get_status() == "trained" else False
508
+
509
+ @classmethod
510
+ def get_model_label(cls):
511
+ return str(cls.get_model_name_mapping()[cls.cur_model]) + "模型是否完成训练" if cls.cur_model != "" else ""
512
+
513
+ @classmethod
514
+ def check_select_model(cls):
515
+ return True if cls.cur_model != "" and cls.check_before_train() else False
516
+
517
+ @classmethod
518
+ def get_model_name(cls):
519
+ return [x for x in cls.container_dict.keys()]
520
+
521
+ @classmethod
522
+ def get_model_chinese_name(cls):
523
+ return ["线性回归", "多项式回归", "逻辑斯谛分类"]
524
+
525
+ @classmethod
526
+ def get_model_name_mapping(cls):
527
+ return dict(zip(cls.get_model_name(), cls.get_model_chinese_name()))
528
+
529
+ @classmethod
530
+ def get_model_name_mapping_reverse(cls):
531
+ return dict(zip(cls.get_model_chinese_name(), cls.get_model_name()))
532
+
533
+ @classmethod
534
+ def get_trained_model_list(cls):
535
+ trained_model_list = []
536
+
537
+ for model_name, container in cls.container_dict.items():
538
+ if container.get_status() == "trained":
539
+ trained_model_list.append(cls.get_model_name_mapping()[model_name])
540
+
541
+ return trained_model_list
542
+
543
+ @classmethod
544
+ def draw_learning_curve_train_plot(cls, model_list: list) -> plt.Figure:
545
+ learning_curve_dict = {}
546
+
547
+ for model_name in model_list:
548
+ model_name = cls.get_model_name_mapping_reverse()[model_name]
549
+ learning_curve_dict[model_name] = cls.container_dict[model_name].get_learning_curve_values()
550
+
551
+ return draw_learning_curve_total(learning_curve_dict, "train")
552
+
553
+ @classmethod
554
+ def draw_learning_curve_validation_plot(cls, model_list: list) -> plt.Figure:
555
+ learning_curve_dict = {}
556
+
557
+ for model_name in model_list:
558
+ model_name = cls.get_model_name_mapping_reverse()[model_name]
559
+ learning_curve_dict[model_name] = cls.container_dict[model_name].get_learning_curve_values()
560
+
561
+ return draw_learning_curve_total(learning_curve_dict, "validation")
562
+
563
+ @classmethod
564
+ def draw_shap_beeswarm_plot(cls, model_name) -> plt.Figure:
565
+ model_name = cls.get_model_name_mapping_reverse()[model_name]
566
+ container = cls.container_dict[model_name]
567
+
568
+ return shap_calculate(container.get_model(), container.x_train, cls.data.columns.values)
569
+
570
+ @classmethod
571
+ def get_shap_beeswarm_plot_file(cls):
572
+ return FilePath.base.format(FilePath.shap_beeswarm_plot)
573
+
574
+ @classmethod
575
+ def check_shap_beeswarm_plot_file(cls):
576
+ return os.path.exists(cls.get_shap_beeswarm_plot_file())
577
+
578
+ @classmethod
579
+ def after_get_shap_beeswarm_plot_file(cls):
580
+ return cls.get_shap_beeswarm_plot_file() if cls.check_shap_beeswarm_plot_file() else None
581
+
582
+ @classmethod
583
+ def get_model_list(cls):
584
+ model_list = []
585
+ for model_name in cls.container_dict.keys():
586
+ model_list.append(cls.get_model_name_mapping()[model_name])
587
+
588
+ return model_list
589
+
590
+ @classmethod
591
+ def select_as_model(cls, model_name: str):
592
+ cls.cur_model = cls.get_model_name_mapping_reverse()[model_name]
593
+
594
+ @classmethod
595
+ def get_model_mark(cls):
596
+ return True if cls.cur_model != "" else False
597
+
598
+ @classmethod
599
+ def get_linear_regression_mark(cls):
600
+ return True if cls.cur_model == MN.linear_regression else False
601
+
602
+ @classmethod
603
+ def get_assign_list(cls):
604
+ return ["分类", "回归"]
605
+
606
+ @classmethod
607
+ def get_assign_mapping_reverse(cls):
608
+ return dict(zip(cls.get_assign_list(), [MN.classification, MN.regression]))
609
+
610
+ @classmethod
611
+ def choose_assign(cls, assign: str):
612
+ cls.assign = cls.get_assign_mapping_reverse()[assign]
613
+
614
+ data_copy = cls.data
615
+
616
+ if cls.assign == MN.classification:
617
+ data_copy.iloc[:, 0] = data_copy.iloc[:, 0].astype(str)
+ else:
+ data_copy.iloc[:, 0] = data_copy.iloc[:, 0].astype(float)
620
+
621
+ cls.data = data_copy
622
+ cls.change_data_type_to_float()
623
+
624
+
625
+ def choose_assign(assign: str):
626
+ Dataset.choose_assign(assign)
627
+
628
+ return get_return(True)
629
+
630
+
631
+ def select_as_model(model_name: str):
632
+ Dataset.select_as_model(model_name)
633
+
634
+ return get_return(True)
635
+
636
+
637
+ def draw_shap_beeswarm_plot(model_name):
638
+ cur_plt = Dataset.draw_shap_beeswarm_plot(model_name)
639
+
640
+ cur_plt.savefig(FilePath.base.format(FilePath.shap_beeswarm_plot), dpi=300)
641
+
642
+ return get_return(True, {shap_beeswarm_plot: gr.Plot(cur_plt, visible=True, label=LN.shap_beeswarm_plot)})
643
+
644
+
645
+ def draw_learning_curve_validation_plot(model_list: list):
646
+ cur_plt = Dataset.draw_learning_curve_validation_plot(model_list)
647
+
648
+ return get_return(True, {learning_curve_validation_plot: gr.Plot(cur_plt, visible=True, label=LN.learning_curve_validation_plot)})
649
+
650
+
651
+ def draw_learning_curve_train_plot(model_list: list):
652
+ cur_plt = Dataset.draw_learning_curve_train_plot(model_list)
653
+
654
+ return get_return(True, {learning_curve_train_plot: gr.Plot(cur_plt, visible=True, label=LN.learning_curve_train_plot)})
655
+
656
+
657
+ def train_model(optimize, linear_regression_model_type):
658
+ Dataset.train_model(optimize, linear_regression_model_type)
659
+
660
+ return get_return(True)
661
+
662
+
663
+ def select_as_y(col: str):
664
+ Dataset.select_as_y(col)
665
+
666
+ return get_return(True)
667
+
668
+
669
+ def standardize_data(col_list: list):
670
+ Dataset.standardize_data(col_list)
671
+
672
+ return get_return(True)
673
+
674
+
675
+ def change_data_type_to_float():
676
+ Dataset.change_data_type_to_float()
677
+
678
+ return get_return(True)
679
+
680
+
681
+ def encode_label(col_list: list):
682
+ Dataset.encode_label(col_list)
683
+
684
+ return get_return(True, {display_encode_label_dataframe: gr.Dataframe(Dataset.get_str2int_mappings_df(), type="pandas", visible=True, label=LN.display_encode_label_dataframe)})
685
+
686
+
687
+ def del_duplicate():
688
+ Dataset.del_duplicate()
689
+
690
+ return get_return(True)
691
+
692
+
693
+ def del_all_na_col():
694
+ Dataset.del_all_na_col()
695
+
696
+ return get_return(True)
697
+
698
+
699
+ def remain_row(num):
700
+ Dataset.remain_row(num)
701
+
702
+ return get_return(True)
703
+
704
+
705
+ def del_col(col_list: list):
706
+ Dataset.del_col(col_list)
707
+
708
+ return get_return(True)
709
+
710
+
711
+ def add_index_into_df(df: pd.DataFrame) -> pd.DataFrame:
712
+ if df.empty:
713
+ return df
714
+
715
+ index_df = pd.DataFrame([x for x in range(len(df))], columns=["[*index]"])
716
+
717
+ return pd.concat([index_df, df], axis=1)
718
+
719
+
720
+ def choose_dataset(file: str):
721
+ if file == "自定义":
722
+ Dataset.clear()
723
+
724
+ return get_return(False)
725
+
726
+ df = load_data(file)
727
+ Dataset.update(file, df)
728
+
729
+ return get_return(True, {choose_custom_dataset_file: gr.File(visible=False)})
730
+
731
+
732
+ def choose_custom_dataset(file: str):
733
+ df = load_custom_data(file)
734
+ Dataset.update(file, df)
735
+
736
+ return get_return(True, {choose_custom_dataset_file: gr.File(Dataset.file, visible=True)})
737
+
738
+
739
+ with gr.Blocks() as demo:
740
+
741
+ '''
742
+ Components
743
+ '''
744
+
745
+ with gr.Tab("机器学习"):
746
+ # Choose the data source
747
+ with gr.Accordion("数据源"):
748
+ with gr.Group():
749
+ choose_dataset_radio = gr.Radio(Dataset.get_dataset_list(), label=LN.choose_dataset_radio)
750
+ choose_custom_dataset_file = gr.File(visible=False)
751
+
752
+ # Show information about the current dataset
753
+ with gr.Accordion("当前数据信息"):
754
+ display_dataset_dataframe = gr.Dataframe(visible=False)
755
+ with gr.Row():
756
+ display_total_col_num_text = gr.Textbox(visible=False)
757
+ display_total_row_num_text = gr.Textbox(visible=False)
758
+ with gr.Column():
759
+ remain_row_slider = gr.Slider(visible=False)
760
+ remain_row_button = gr.Button(visible=False)
761
+ with gr.Row():
762
+ with gr.Column():
763
+ with gr.Row():
764
+ display_na_list_text = gr.Textbox(visible=False)
765
+ display_duplicate_num_text = gr.Textbox(visible=False)
766
+ with gr.Row():
767
+ del_all_na_col_button = gr.Button(visible=False)
768
+ del_duplicate_button = gr.Button(visible=False)
769
+
770
+ # Dataset operations
771
+ with gr.Accordion("数据处理"):
772
+ select_as_y_radio = gr.Radio(visible=False)
773
+ with gr.Row():
774
+ with gr.Column():
775
+ data_type_dataframe = gr.Dataframe(visible=False)
776
+ change_data_type_to_float_button = gr.Button(visible=False)
777
+ choose_assign_radio = gr.Radio(visible=False)
778
+ with gr.Column():
779
+ del_col_checkboxgroup = gr.Checkboxgroup(visible=False)
780
+ del_col_button = gr.Button(visible=False)
781
+ encode_label_checkboxgroup = gr.Checkboxgroup(visible=False)
782
+ encode_label_button = gr.Button(visible=False)
783
+ display_encode_label_dataframe = gr.Dataframe(visible=False)
784
+ standardize_data_checkboxgroup = gr.Checkboxgroup(visible=False)
785
+ standardize_data_button = gr.Button(visible=False)
786
+
787
+ # Models
788
+ with gr.Accordion("数据模型"):
789
+ select_as_model_radio = gr.Radio(visible=False)
790
+ linear_regression_model_radio = gr.Radio(visible=False)
791
+ model_optimize_radio = gr.Radio(visible=False)
792
+ model_train_button = gr.Button(visible=False)
793
+ model_train_checkbox = gr.Checkbox(visible=False)
794
+
795
+ # Visualization
796
+ with gr.Accordion("数据可视化"):
797
+ learning_curve_checkboxgroup = gr.Checkboxgroup(visible=False)
798
+ with gr.Row():
799
+ learning_curve_train_button = gr.Button(visible=False)
800
+ learning_curve_validation_button = gr.Button(visible=False)
801
+ learning_curve_train_plot = gr.Plot(visible=False)
802
+ learning_curve_validation_plot = gr.Plot(visible=False)
803
+ shap_beeswarm_radio = gr.Radio(visible=False)
804
+ shap_beeswarm_button = gr.Button(visible=False)
805
+ with gr.Group():
806
+ shap_beeswarm_plot = gr.Plot(visible=False)
807
+ shap_beeswarm_plot_file = gr.File(visible=False)
808
+
809
+ '''
810
+ Event listeners
811
+ '''
812
+
813
+ # Choose the data source
814
+ choose_dataset_radio.change(fn=choose_dataset, inputs=[choose_dataset_radio], outputs=get_outputs())
815
+ choose_custom_dataset_file.upload(fn=choose_custom_dataset, inputs=[choose_custom_dataset_file], outputs=get_outputs())
816
+
817
+ # Dataset operations
818
+
819
+ # Delete the selected columns
820
+ del_col_button.click(fn=del_col, inputs=[del_col_checkboxgroup], outputs=get_outputs())
821
+ # Keep only the first N rows
822
+ remain_row_button.click(fn=remain_row, inputs=[remain_row_slider], outputs=get_outputs())
823
+ # Delete all columns containing missing values
824
+ del_all_na_col_button.click(fn=del_all_na_col, outputs=get_outputs())
825
+ # Delete all duplicate rows
826
+ del_duplicate_button.click(fn=del_duplicate, outputs=get_outputs())
827
+ # Label-encode string columns into numeric columns
828
+ encode_label_button.click(fn=encode_label, inputs=[encode_label_checkboxgroup], outputs=get_outputs())
829
+ # Force-cast all data to float (except the first column)
830
+ change_data_type_to_float_button.click(fn=change_data_type_to_float, outputs=get_outputs())
831
+ # Standardize data
832
+ standardize_data_button.click(fn=standardize_data, inputs=[standardize_data_checkboxgroup], outputs=get_outputs())
833
+ # Choose the dependent variable
834
+ select_as_y_radio.change(fn=select_as_y, inputs=[select_as_y_radio], outputs=get_outputs())
835
+ # Choose the task type (force-casts the first column)
836
+ choose_assign_radio.change(fn=choose_assign, inputs=[choose_assign_radio], outputs=get_outputs())
837
+
838
+ # Models
839
+ select_as_model_radio.change(fn=select_as_model, inputs=[select_as_model_radio], outputs=get_outputs())
840
+ model_train_button.click(fn=train_model, inputs=[model_optimize_radio, linear_regression_model_radio], outputs=get_outputs())
841
+
842
+ # Visualization
843
+ learning_curve_train_button.click(fn=draw_learning_curve_train_plot, inputs=[learning_curve_checkboxgroup], outputs=get_outputs())
844
+ learning_curve_validation_button.click(fn=draw_learning_curve_validation_plot, inputs=[learning_curve_checkboxgroup], outputs=get_outputs())
845
+ shap_beeswarm_button.click(fn=draw_shap_beeswarm_plot, inputs=[shap_beeswarm_radio], outputs=get_outputs())
846
+
847
+ if __name__ == "__main__":
848
+ demo.launch()
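The blocks above wire the UI through Gradio's event listeners (Radio.change, Button.click, File.upload), each routing its inputs to a handler and refreshing every component returned by get_outputs(). A stripped-down, self-contained sketch of the same listener pattern, independent of this app's handlers and assuming only the gradio package pinned in requirements.txt:

import gradio as gr

def greet(name):
    # placeholder handler, standing in for functions such as choose_dataset or train_model
    return "Hello, {}".format(name)

with gr.Blocks() as demo_sketch:
    name_textbox = gr.Textbox(label="Name")
    greet_button = gr.Button("Greet")
    greeting_textbox = gr.Textbox(label="Greeting")
    # Button.click plays the same role as the .click/.change calls above
    greet_button.click(fn=greet, inputs=[name_textbox], outputs=[greeting_textbox])

if __name__ == "__main__":
    demo_sketch.launch()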
metrics/__init__.py ADDED
File without changes
metrics/calculate_classification_metrics.py ADDED
@@ -0,0 +1,35 @@
1
+ import numpy as np
2
+ from sklearn.metrics import *
3
+ from sklearn.preprocessing import label_binarize
4
+
5
+ from visualization.draw_line_graph import draw_line_graph
6
+
7
+
8
+ def calculate_classification_metrics(pred_data, real_data, model_name):
9
+ info = {}
10
+
11
+ real_data = np.round(real_data, 0).astype(int)
12
+ pred_data = np.round(pred_data, 0).astype(int)
13
+
14
+ cur_confusion_matrix = confusion_matrix(real_data[:, 0], pred_data)
15
+ info["Confusion matrix of "+model_name] = cur_confusion_matrix
16
+
17
+ info["Accuracy of "+model_name] = np.sum(cur_confusion_matrix.diagonal()) / np.sum(cur_confusion_matrix)
18
+ info["Precision of "+model_name] = cur_confusion_matrix.diagonal() / np.sum(cur_confusion_matrix, axis=0)
19
+ info["Recall of "+model_name] = cur_confusion_matrix.diagonal() / np.sum(cur_confusion_matrix, axis=1)
20
+ info["F1-score of "+model_name] = np.mean(2 * np.multiply(info["Precision of "+model_name], info["Recall of "+model_name]) / \
21
+ (info["Precision of "+model_name] + info["Recall of "+model_name]))
22
+
23
+ max_class = max(real_data)[0]
24
+ min_class = min(real_data)[0]
25
+ pred_data_ = label_binarize(pred_data, classes=range(min_class, max_class+1))
26
+ real_data_ = label_binarize(real_data, classes=range(min_class, max_class+1))
27
+
28
+ for i in range(max_class - min_class):
29
+ fpr, tpr, thresholds = roc_curve(real_data_[:, i], pred_data_[:, i])
30
+ # draw_line_graph(fpr, tpr, "ROC curve with AUC={:.2f}".format(auc(fpr, tpr)))
31
+
32
+ info["AUC of "+model_name] = roc_auc_score(real_data_, pred_data_)
33
+
34
+ return info
35
+
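A minimal usage sketch for calculate_classification_metrics; the toy label arrays are illustrative only. The function expects the true labels as a 2-D array (labels in column 0) and the predictions as a flat array of class labels:

import numpy as np
from metrics.calculate_classification_metrics import calculate_classification_metrics

real = np.array([[0], [1], [2], [1], [0], [2]])  # ground-truth labels, one column
pred = np.array([0, 1, 1, 1, 0, 2])              # predicted labels
info = calculate_classification_metrics(pred, real, "demo model")
for name, value in info.items():
    print(name, value)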
metrics/calculate_regression_metrics.py ADDED
@@ -0,0 +1,47 @@
1
+ import numpy as np
2
+ from sklearn.metrics import *
3
+
4
+
5
+ def calculate_ar2(real_data, pred_data):
6
+ model_name = "a"
7
+ info = {}
8
+
9
+ info["MAE of "+model_name] = mean_absolute_error(real_data, pred_data)
10
+ # mae = mean_absolute_error(real_data, pred_data)
11
+ info["MSE of "+model_name] = mean_squared_error(real_data, pred_data)
12
+ # mse = mean_squared_error(real_data, pred_data)
13
+ info["RMSE of "+model_name] = np.sqrt(info["MSE of "+model_name])
14
+ # rmse = np.sqrt(info["MSE of "+model_name])
15
+ info["R-Square of "+model_name] = r2_score(real_data, pred_data)
16
+ # r2 = r2_score(real_data, pred_data)
17
+ if isinstance(max(real_data), np.ndarray):
18
+ info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of "+model_name]) * (len(pred_data)-1) / (len(pred_data)-max(real_data)[0]-1)
19
+ # ar2 = 1 - (1 - info["R-Sqaure of "+model_name]) * (len(pred_data)-1) / (len(pred_data)-max(real_data)[0]-1)
20
+ else:
21
+ info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)
22
+ # ar2 = 1 - (1 - info["R-Sqaure of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)
23
+
24
+ return info["Adjusted R-Square of " + model_name]
25
+
26
+
27
+ def calculate_regression_metrics(pred_data, real_data, model_name):
28
+ info = {}
29
+
30
+ info["MAE of "+model_name] = mean_absolute_error(real_data, pred_data)
31
+ # mae = mean_absolute_error(real_data, pred_data)
32
+ info["MSE of "+model_name] = mean_squared_error(real_data, pred_data)
33
+ # mse = mean_squared_error(real_data, pred_data)
34
+ info["RMSE of "+model_name] = np.sqrt(info["MSE of "+model_name])
35
+ # rmse = np.sqrt(info["MSE of "+model_name])
36
+ info["R-Square of "+model_name] = r2_score(real_data, pred_data)
37
+ # r2 = r2_score(real_data, pred_data)
38
+ if isinstance(max(real_data), np.ndarray):
39
+ info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of "+model_name]) * (len(pred_data)-1) / (len(pred_data)-max(real_data)[0]-1)
40
+ # ar2 = 1 - (1 - info["R-Sqaure of "+model_name]) * (len(pred_data)-1) / (len(pred_data)-max(real_data)[0]-1)
41
+ else:
42
+ info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)
43
+ # ar2 = 1 - (1 - info["R-Sqaure of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)
44
+
45
+ return info
46
+
47
+
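For reference, the conventional adjusted R-square uses the sample size n and the number of predictors p:

    adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1)

Both functions above substitute max(real_data) for p, so the reported "Adjusted R-Square" coincides with the textbook definition only when the largest target value equals the number of predictors.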
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ numpy~=1.23.5
2
+ pandas~=1.5.3
3
+ scikit-learn~=1.2.1
4
+ hmmlearn~=0.3.0
5
+ matplotlib~=3.7.0
6
+ scikit-fuzzy~=0.4.2
7
+ gradio~=4.17.0
8
+ shap~=0.44.1
9
+ networkx~=2.8.4
10
+ scipy~=1.10.0
11
+ xgboost~=2.0.3
12
+ tqdm~=4.64.1
static/__init__.py ADDED
File without changes
static/col.py ADDED
@@ -0,0 +1,68 @@
1
+ def get_pca_col():
2
+ return [
3
+ "p1_momentum_value_better",
4
+ "elapsed_time",
5
+ "server",
6
+ "serve_no",
7
+ "p1_ace",
8
+ "p2_ace",
9
+ "p1_winner",
10
+ "p2_winner",
11
+ "winner_shot_type",
12
+ # "p1_double_fault",
13
+ "p2_double_fault",
14
+ "p1_unf_err",
15
+ "p2_unf_err",
16
+ "p1_net_pt",
17
+ "p2_net_pt",
18
+ "p1_net_pt_won",
19
+ "p2_net_pt_won",
20
+ "p1_break_pt",
21
+ "p2_break_pt",
22
+ "p1_break_pt_won",
23
+ "p2_break_pt_won",
24
+ "p1_break_pt_missed",
25
+ "p2_break_pt_missed",
26
+ "p1_distance_run",
27
+ "p2_distance_run",
28
+ "rally_count",
29
+ "speed_mph",
30
+ "serve_width",
31
+ "serve_depth",
32
+ "return_depth"
33
+ ]
34
+
35
+
36
+ def get_momentum_col(p):
37
+ return [
38
+ "point_victor",
39
+ "elapsed_time",
40
+ "server",
41
+ "serve_no",
42
+ "{}_ace".format(p),
43
+ # "p2_ace",
44
+ "{}_winner".format(p),
45
+ # "p2_winner",
46
+ "winner_shot_type",
47
+ # "p1_double_fault",
48
+ # "p2_double_fault",
49
+ "{}_unf_err".format(p),
50
+ # "p2_unf_err",
51
+ "{}_net_pt".format(p),
52
+ # "p2_net_pt",
53
+ "{}_net_pt_won".format(p),
54
+ # "p2_net_pt_won",
55
+ "{}_break_pt".format(p),
56
+ # "p2_break_pt",
57
+ "{}_break_pt_won".format(p),
58
+ # "p2_break_pt_won",
59
+ "{}_break_pt_missed".format(p),
60
+ # "p2_break_pt_missed",
61
+ "{}_distance_run".format(p),
62
+ # "p2_distance_run",
63
+ "rally_count",
64
+ "speed_mph",
65
+ "serve_width",
66
+ "serve_depth",
67
+ "return_depth"
68
+ ]
static/config.py ADDED
@@ -0,0 +1,51 @@
1
+ class Config:
2
+ # Random seed
3
+ RANDOM_STATE = 123
4
+ # Color palettes for plotting
5
+ COLORS = [
6
+ "#8074C8",
7
+ "#7895C1",
8
+ "#A8CBDF",
9
+ "#992224",
10
+ "#B54764",
11
+ "#E3625D",
12
+ "#EF8B67",
13
+ "#F0C284"
14
+ ]
15
+ COLORS_1 = [
16
+ "#91CCC0",
17
+ "#7FABD1",
18
+ "#F7AC53",
19
+ "#EC6E66",
20
+ "#B5CE4E",
21
+ "#BD7795",
22
+ "#B55384",
23
+ "#474769",
24
+ "#257D88",
25
+ "#ED8D5A",
26
+ "#BFDFD2",
27
+ "#EFCE87"
28
+ ]
29
+
30
+ COLORS_2 = [
31
+ "#A21A54",
32
+ "#E7724F",
33
+ "#32183C"
34
+ ]
35
+
36
+ COLORS_3 = [
37
+ "#ABD1BC",
38
+ "#CCCC99",
39
+ "#E3BBED"
40
+ ]
41
+
42
+
43
+ COLORS_4 = [
44
+ "#CFCFD0",
45
+ "#B6B3D6",
46
+ "#F58F7A",
47
+ "#E9687A",
48
+ ]
49
+
50
+ # Number of points shown in the prediction plots
51
+ DISPLAY_RANGE = 100
static/process.py ADDED
@@ -0,0 +1,313 @@
1
+ import numpy as np
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.model_selection import KFold
4
+ from sklearn import preprocessing
5
+ from sklearn.model_selection import GridSearchCV
6
+ from skopt import BayesSearchCV
7
+ import copy
8
+ import pandas as pd
9
+ from scipy.stats import spearmanr
10
+
11
+ from sklearn.datasets import load_iris
12
+ from sklearn.datasets import load_wine
13
+ from sklearn.datasets import load_breast_cancer
14
+ from scipy.linalg import eig
15
+
16
+ from static.config import Config
17
+
18
+
19
+ def match_split(df: pd.DataFrame):
20
+ return df.groupby("match_id")
21
+
22
+
23
+ # Spearman rank correlation coefficient
24
+ def calculate_spearmanr(x, y):
25
+ rho, p_value = spearmanr(x, y)
26
+
27
+ return rho, p_value
28
+
29
+
30
+ def calculate_remain_positive_points(df: pd.DataFrame):
31
+ # When the remain_positive distance is infinitely far, it is set to len(df)
32
+
33
+ df["p1_remain_positive"] = 0
34
+ df["p2_remain_positive"] = 0
35
+ p1_zero_distance_list = []
36
+ p2_zero_distance_list = []
37
+
38
+ for i in range(1, len(df)):
39
+ if (df.loc[i, "p1_momentum_value_better"] > 0
40
+ and i != 0):
41
+ p1_zero_distance_list.append(i)
42
+ elif (df.loc[i, "p1_momentum_value_better"] < 0
43
+ and i != 0):
44
+ p2_zero_distance_list.append(i)
45
+
46
+ for j in range(len(df)):
47
+ for x in p1_zero_distance_list:
48
+ if j <= x:
49
+ df.loc[j, "p1_remain_positive"] = x - j
50
+ break
51
+ else:
52
+ continue
53
+
54
+ for j in range(len(df)):
55
+ for x in p2_zero_distance_list:
56
+ if j <= x:
57
+ df.loc[j, "p2_remain_positive"] = x - j
58
+ break
59
+ else:
60
+ continue
61
+
62
+ return df
63
+
64
+
65
+ def calculate_swing_point(df:pd.DataFrame):
66
+ # When the swing distance is infinitely far, it is set to len(df)
67
+
68
+ df["swing"] = 0
69
+ zero_distance_list = []
70
+
71
+ for i in range(1, len(df)):
72
+ if (df.loc[i, "p1_momentum_value_better"] > 0 and df.loc[i-1, "p1_momentum_value_better"] < 0
73
+ and i != 0) or (df.loc[i, "p1_momentum_value_better"] < 0 and df.loc[i - 1, "p1_momentum_value_better"] > 0
74
+ and i != 0):
75
+ zero_distance_list.append(i)
76
+
77
+ for j in range(len(df)):
78
+ for x in zero_distance_list:
79
+ if j <= x:
80
+ df.loc[j, "swing"] = x - j
81
+ break
82
+ else:
83
+ continue
84
+
85
+ return df
86
+
87
+
88
+ def replace_na_to_label(df: pd.DataFrame):
89
+ return df.fillna("Not A Number")
90
+
91
+
92
+ def get_state_distribution(data):
93
+ # get the matrix of correlation coefficients
94
+ covX = np.around(np.corrcoef(data.T), decimals=3)
95
+
96
+ # draw_heat_map(covX, "related", False)
97
+
98
+ # Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
99
+ eigenvalues, eigenvectors = np.linalg.eig(covX.T)
100
+
101
+ eigenvalues = np.around(eigenvalues, decimals=3)
102
+
103
+ eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))
104
+
105
+ # Sort feature values in descending order
106
+ eigenvalues = sorted(eigenvalues, reverse=True)
107
+
108
+ for i, value in enumerate(eigenvalues):
109
+ if i == 0:
110
+ sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
111
+ else:
112
+ sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)
113
+
114
+ # draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")
115
+
116
+ # get the contribution of the eigenvalues
117
+ contribution = eigenvalues / np.sum(eigenvalues)
118
+
119
+ return contribution
120
+
121
+
122
+ # Exponentially weighted moving average
123
+ def exponential_moving_average(df):
124
+ alpha = 0.3
125
+
126
+ ema = [df[0]]
127
+
128
+ for i in range(1, len(df)):
129
+ ema_value = alpha * df[i] + (1 - alpha) * ema[i-1]
130
+ ema.append(ema_value)
131
+
132
+ return ema
133
+
134
+
135
+ def need_to_mark_in_plot(df, col_name):
136
+ return df.where(df[col_name] == 1).dropna()
137
+
138
+
139
+ def point_victor_mapping(df):
140
+ mapping = {
141
+ 1: 0.0,
142
+ 2: 1.0
143
+ }
144
+ df["point_victor"] = df["point_victor"].map(mapping)
145
+
146
+ return df
147
+
148
+
149
+ def pick_matches_with_name(df, name):
150
+ df = df.where(df["match_id"] == name).dropna()
151
+
152
+ p1_name = df["player1"].iloc[0]
153
+ p2_name = df["player2"].iloc[0]
154
+
155
+ return df, p1_name, p2_name
156
+
157
+
158
+ def pick_matches_with_longest(df):
159
+ target_match_id = df.groupby("match_id").size().idxmax()
160
+
161
+ df = df.where(df["match_id"] == target_match_id).dropna()
162
+
163
+ p1_name = df["player1"].iloc[0]
164
+ p2_name = df["player2"].iloc[0]
165
+
166
+ return df, p1_name, p2_name
167
+
168
+
169
+ def choose_y_col_in_dataframe(df: pd.DataFrame, y_col: str):
170
+ y_data = df[y_col]
171
+ df.drop(y_col, axis=1, inplace=True)
172
+ df.insert(0, y_col, y_data)
173
+
174
+ return df
175
+
176
+
177
+ def load_data(sort):
178
+ if sort == "Iris Dataset":
179
+ sk_data = load_iris()
180
+ elif sort == "Wine Dataset":
181
+ sk_data = load_wine()
182
+ elif sort == "Breast Cancer Dataset":
183
+ sk_data = load_breast_cancer()
184
+
185
+ target_data = sk_data.target.astype(str)
186
+ for i in range(len(sk_data.target_names)):
187
+ target_data = np.where(target_data == str(i), sk_data.target_names[i], target_data)
188
+
189
+ sk_feature_names = sk_data.feature_names
190
+ sk_data = np.concatenate((target_data.reshape(-1, 1), sk_data.data), axis=1)
191
+ sk_feature_names = np.insert(sk_feature_names, 0, "species")
192
+
193
+ df = pd.DataFrame(data=sk_data, columns=sk_feature_names)
194
+
195
+ return df
196
+
197
+
198
+ def load_custom_data(file):
199
+ return pd.read_csv(file)
200
+
201
+
202
+ def preprocess_raw_data_filtering(df):
203
+ info = {}
204
+
205
+ len_0 = len(df)
206
+ info["Total size of raw data"] = len_0
207
+
208
+ # Delete the column "CUSTOMER_ID"
209
+ # df.drop("CUSTOMER_ID", axis=1, inplace=True)
210
+
211
+ # Remove duplicate data
212
+ df.drop_duplicates(inplace=True)
213
+ len_1 = len_0 - len(df)
214
+ info["Number of duplicates in the raw data"] = len_1
215
+
216
+ # Remove "nan" data
217
+ # df = remove_nan_from_data(df)
218
+ # len_2 = len_0 - len_1 - len(df)
219
+ # info["Number of nan in the raw data"] = len_2
220
+
221
+ info["Total size of filtered data after data preprocessing"] = len(df)
222
+
223
+ # Save the cleaned data to a csv format file
224
+ # df.to_csv("../data/filtered_data.csv", index=False)
225
+
226
+ return df, info
227
+
228
+
229
+ def remove_nan_from_data(df):
230
+ # Remove "nan" data
231
+ df.dropna(inplace=True)
232
+
233
+ return df
234
+
235
+
236
+ # Get standardized data
237
+ def get_standardized_data(df):
238
+ array = np.concatenate(((df.iloc[:, :1]).values, preprocessing.scale(df.iloc[:, 1:])), axis=1)
239
+
240
+ return array
241
+
242
+
243
+ def split_dataset(array):
244
+ x_train_and_validate, x_test, y_train_and_validate, y_test = train_test_split(
245
+ array[:, 1:],
246
+ array[:, :1],
247
+ random_state=Config.RANDOM_STATE,
248
+ train_size=0.8
249
+ )
250
+
251
+ return x_train_and_validate, x_test, y_train_and_validate, y_test
252
+
253
+
254
+ def k_fold_cross_validation_data_segmentation(x_train, y_train):
255
+ k = 5
256
+
257
+ train_data_array = np.concatenate((y_train, x_train), axis=1)
258
+
259
+ k_fold = KFold(n_splits=k, shuffle=True, random_state=Config.RANDOM_STATE)
260
+
261
+ train_data_list = []
262
+ validate_data_list = []
263
+ for train_index, validate_index in k_fold.split(train_data_array):
264
+ train_data_list.append(train_data_array[train_index])
265
+ validate_data_list.append(train_data_array[validate_index])
266
+
267
+ train_and_validate_data_list = []
268
+
269
+ for i in range(k):
270
+ train_and_validate_data_list.append((
271
+ train_data_list[i][:, 1:],
272
+ validate_data_list[i][:, 1:],
273
+ train_data_list[i][:, 0],
274
+ validate_data_list[i][:, 0]
275
+ ))
276
+
277
+ return train_and_validate_data_list
278
+
279
+
280
+ def grid_search(params, model, x_train, y_train, scoring=None):
281
+ info = {}
282
+
283
+ if scoring == "neg_mean_squared_error":
284
+ grid_search_model = GridSearchCV(model, params, cv=5, scoring="neg_mean_squared_error")
285
+ else:
286
+ grid_search_model = GridSearchCV(model, params, cv=5)
287
+
288
+ grid_search_model.fit(x_train, y_train.ravel())
289
+
290
+ info["Optimal hyperparameters"] = grid_search_model.best_params_
291
+
292
+ best_model = grid_search_model.best_estimator_
293
+
294
+ return best_model
295
+
296
+
297
+ def bayes_search(params, model, x_train, y_train, scoring=None):
298
+ info = {}
299
+
300
+ if scoring == "neg_mean_squared_error":
301
+ bayes_search_model = BayesSearchCV(model, params, cv=5, n_iter=50, scoring="neg_mean_squared_error")
302
+ else:
303
+ bayes_search_model = BayesSearchCV(model, params, cv=5, n_iter=50)
304
+
305
+ bayes_search_model.fit(x_train, y_train)
306
+
307
+ info["Optimal hyperparameters"] = bayes_search_model.best_params_
308
+
309
+ best_model = bayes_search_model.best_estimator_
310
+
311
+ return best_model
312
+
313
+
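A minimal sketch of how the splitting and hyperparameter-search helpers above compose. The synthetic regression array and the Ridge parameter grid are placeholders, not part of this commit; column 0 of the array is the target, as split_dataset expects:

import numpy as np
from sklearn.linear_model import Ridge

from static.process import split_dataset, k_fold_cross_validation_data_segmentation, grid_search

rng = np.random.default_rng(123)
x = rng.normal(size=(100, 4))
y = x @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=100)
data_array = np.concatenate((y.reshape(-1, 1), x), axis=1)  # target in column 0

x_train, x_test, y_train, y_test = split_dataset(data_array)
folds = k_fold_cross_validation_data_segmentation(x_train, y_train)  # five (x_train, x_validate, y_train, y_validate) tuples
best_ridge = grid_search({"alpha": [0.1, 1.0, 10.0]}, Ridge(), x_train, y_train)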
visualization/__init__.py ADDED
File without changes
visualization/draw_boxplot.py ADDED
@@ -0,0 +1,26 @@
1
+ import matplotlib.pyplot as plt
2
+
3
+ from static.config import Config
4
+
5
+
6
+ # draw boxplot
7
+ def draw_boxplot(x_data, title):
8
+ plt.figure(figsize=(10, 14))
9
+ plt.grid(True)
10
+
11
+ plt.boxplot(
12
+ x_data,
13
+ meanline=True,
14
+ showmeans=True,
15
+ medianprops={"color": Config.COLORS[0], "linewidth": 1.5},
16
+ meanprops={"color": Config.COLORS[1], "ls": "--", "linewidth": 1.5},
17
+ flierprops={"marker": "o", "markerfacecolor": Config.COLORS[2]},
18
+ labels=x_data.columns.values
19
+ )
20
+
21
+ plt.xticks(rotation=-45)
22
+ plt.title(title)
23
+
24
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
25
+
26
+ plt.show()
visualization/draw_heat_map.py ADDED
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+
5
+ from static.config import Config
6
+
7
+
8
+ # Draw heat map
9
+ def draw_heat_map(x_data, title, is_rotate, col_name):
10
+ # col_name = np.delete(col_name, np.where(col_name == "swing"))
11
+
12
+ plt.rcParams.update({'figure.autolayout': True})
13
+
14
+ plt.figure(figsize=(16, 16))
15
+
16
+ if isinstance(x_data, np.ndarray):
17
+ np_data = np.around(x_data.astype("float64"), 2)
18
+ pd_data = pd.DataFrame(x_data)
19
+ elif isinstance(x_data, pd.DataFrame):
20
+ np_data = np.around(x_data.to_numpy().astype("float64"), 2)
21
+ pd_data = x_data
22
+
23
+ for i in range(np_data.shape[0]):
24
+ for j in range(np_data.shape[1]):
25
+ plt.text(j, i, np_data[i, j], ha="center", va="center", color="w")
26
+
27
+ if is_rotate:
28
+ plt.xticks(np.arange(len(pd_data.columns.values)), col_name, rotation=-90)
29
+ else:
30
+ plt.xticks(np.arange(len(pd_data.columns.values)), col_name)
31
+
32
+ plt.yticks(np.arange(len(pd_data.index.values)), col_name)
33
+ plt.imshow(np_data)
34
+ # plt.colorbar(False)
35
+ plt.tight_layout()
36
+ # plt.title(title)
37
+
38
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
39
+
40
+ plt.show()
visualization/draw_histogram.py ADDED
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ # Plot bar charts
8
+ def draw_histogram(x_data, y_data, will_rotate, will_show_text, title):
9
+ fig, ax = plt.subplots(figsize=(10, 8))
10
+
11
+ bars = plt.bar(
12
+ np.arange(0, len(x_data)),
13
+ x_data,
14
+ align="center",
15
+ alpha=1,
16
+ color=Config.COLORS,
17
+ tick_label=y_data
18
+ )
19
+
20
+ # Bar annotation
21
+ if will_show_text:
22
+ for bar in bars:
23
+ ax.annotate(
24
+ str(bar.get_height()),
25
+ xy=(bar.get_x() + bar.get_width() / 2,
26
+ bar.get_height()),
27
+ xytext=(0, 3),
28
+ textcoords="offset points",
29
+ va="bottom",
30
+ ha="center"
31
+ )
32
+
33
+ if will_rotate:
34
+ plt.xticks(rotation=-90)
35
+
36
+ plt.title(title)
37
+
38
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
39
+
40
+ plt.show()
visualization/draw_histogram_line_subgraph.py ADDED
@@ -0,0 +1,48 @@
1
+ import numpy as np
2
+ from matplotlib import pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ def draw_histogram_line_subgraph(total_data_for_plot):
8
+ # Manually adjust based on the data
9
+ layout = """
10
+ ABC
11
+ DDE
12
+ FGH
13
+ IJK
14
+ """
15
+
16
+ fig, ax = plt.subplot_mosaic(layout, figsize=(16, 16))
17
+
18
+ for i, data in enumerate(total_data_for_plot):
19
+ if data[0] == "line_graph":
20
+ ax[str(chr(i+65))].grid()
21
+ ax[str(chr(i+65))].plot(
22
+ data[1],
23
+ data[2],
24
+ "-o",
25
+ color=Config.COLORS[0],
26
+ markersize=4
27
+ )
28
+ ax[str(chr(i+65))].set_title(data[3])
29
+ elif data[0] == "histogram":
30
+ ax[str(chr(i+65))].grid()
31
+ ax[str(chr(i+65))].bar(
32
+ np.arange(0, len(data[1])),
33
+ data[1],
34
+ align="center",
35
+ alpha=1,
36
+ color=Config.COLORS,
37
+ tick_label=data[2]
38
+ )
39
+
40
+ if data[3]:
41
+ ax[str(chr(i+65))].tick_params(axis='x', labelrotation=-90)
42
+
43
+ ax[str(chr(i+65))].set_title(data[5])
44
+
45
+ plt.tight_layout()
46
+ plt.savefig("./diagram/{}.png".format("total"), dpi=300)
47
+
48
+ plt.show()
visualization/draw_learning_curve.py ADDED
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ from matplotlib import pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ def draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std):
8
+ plt.figure(figsize=(10, 6))
9
+
10
+ plt.fill_between(
11
+ train_sizes,
12
+ train_scores_mean - train_scores_std,
13
+ train_scores_mean + train_scores_std,
14
+ alpha=0.1,
15
+ color=Config.COLORS[0]
16
+ )
17
+ plt.plot(
18
+ train_sizes,
19
+ train_scores_mean,
20
+ "o-",
21
+ color=Config.COLORS[0],
22
+ label="Training score"
23
+ )
24
+
25
+ plt.fill_between(
26
+ train_sizes,
27
+ test_scores_mean - test_scores_std,
28
+ test_scores_mean + test_scores_std,
29
+ alpha=0.1,
30
+ color=Config.COLORS[1]
31
+ )
32
+ plt.plot(
33
+ train_sizes,
34
+ test_scores_mean,
35
+ "o-",
36
+ color=Config.COLORS[1],
37
+ label="Cross-validation score"
38
+ )
39
+
40
+ plt.title("Learning curve")
41
+ plt.xlabel("Sizes")
42
+ plt.ylabel("Accuracy")
43
+ plt.legend(loc="best")
44
+ plt.show()
visualization/draw_learning_curve_total.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ from matplotlib import pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ def draw_learning_curve_total(input_dict, type):
8
+ plt.figure(figsize=(10, 6), dpi=300)
9
+
10
+ if type == "train":
11
+ i = 0
12
+ for label_name, values in input_dict.items():
13
+ train_sizes = values[0]
14
+ train_scores_mean = values[1]
15
+ train_scores_std = values[2]
16
+ test_scores_mean = values[3]
17
+ test_scores_std = values[4]
18
+
19
+ plt.fill_between(
20
+ train_sizes,
21
+ train_scores_mean - train_scores_std,
22
+ train_scores_mean + train_scores_std,
23
+ alpha=0.1,
24
+ color=Config.COLORS[i]
25
+ )
26
+
27
+ plt.plot(
28
+ train_sizes,
29
+ train_scores_mean,
30
+ "o-",
31
+ color=Config.COLORS[i],
32
+ label=label_name
33
+ )
34
+
35
+ i += 1
36
+
37
+ title = "Training Learning curve"
38
+ # plt.title(title)
39
+
40
+ else:
41
+ i = 0
42
+ for label_name, values in input_dict.items():
43
+ train_sizes = values[0]
44
+ train_scores_mean = values[1]
45
+ train_scores_std = values[2]
46
+ test_scores_mean = values[3]
47
+ test_scores_std = values[4]
48
+
49
+ plt.fill_between(
50
+ train_sizes,
51
+ test_scores_mean - test_scores_std,
52
+ test_scores_mean + test_scores_std,
53
+ alpha=0.1,
54
+ color=Config.COLORS[i]
55
+ )
56
+ plt.plot(
57
+ train_sizes,
58
+ test_scores_mean,
59
+ "o-",
60
+ color=Config.COLORS[i],
61
+ label=label_name
62
+ )
63
+
64
+ i += 1
65
+
66
+ title = "Cross-validation Learning curve"
67
+ # plt.title(title)
68
+
69
+ plt.xlabel("Sizes")
70
+ plt.ylabel("Adjusted R-square")
71
+ plt.legend()
72
+
73
+ # plt.savefig("./diagram/{}.png".format(title), dpi=300)
74
+ # plt.show()
75
+ return plt
76
+
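A sketch of how the five values expected per input_dict entry can be produced with sklearn's learning_curve; the LinearRegression model and the random data are placeholders:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve

from visualization.draw_learning_curve_total import draw_learning_curve_total

x, y = np.random.rand(200, 3), np.random.rand(200)
train_sizes, train_scores, test_scores = learning_curve(LinearRegression(), x, y, cv=5)
input_dict = {
    "Linear regression": [
        train_sizes,
        train_scores.mean(axis=1), train_scores.std(axis=1),
        test_scores.mean(axis=1), test_scores.std(axis=1),
    ]
}
plot = draw_learning_curve_total(input_dict, "train")  # returns the pyplot module for further handling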
visualization/draw_line_graph.py ADDED
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ # draw line graph
8
+ def draw_line_graph(x_data, y_data: list, title):
9
+ plt.figure(figsize=(10, 8))
10
+
11
+ plt.plot(
12
+ x_data,
13
+ y_data,
14
+ "-o",
15
+ color=Config.COLORS[0]
16
+ )
17
+
18
+ plt.title(title)
19
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
20
+
21
+ plt.show()
22
+
23
+
24
+ def draw_line_graph_1(x_data, y_data: list, title, labels: list):
25
+ plt.figure(figsize=(10, 8))
26
+
27
+ for i, single_y_data in enumerate(y_data):
28
+ plt.plot(
29
+ x_data,
30
+ single_y_data,
31
+ "-o",
32
+ color=Config.COLORS[i],
33
+ label=labels[i]
34
+ )
35
+
36
+ plt.legend()
37
+ plt.title(title)
38
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
39
+
40
+ plt.show()
visualization/draw_momentum.py ADDED
@@ -0,0 +1,52 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import *
4
+ from sklearn.preprocessing import label_binarize
5
+
6
+ from static.config import Config
7
+
8
+
9
+ def draw_momentum(df, p1_name, p2_name):
10
+ plt.figure(figsize=(10, 6))
11
+
12
+ plt.plot(
13
+ df.loc[:, "elapsed_time"].values,
14
+ df.loc[:, "p1_momentum_value"].values,
15
+ "-",
16
+ color=Config.COLORS_1[8],
17
+ alpha=0.5,
18
+ label=p1_name
19
+ )
20
+ plt.plot(
21
+ df.loc[:, "elapsed_time"].values,
22
+ df.loc[:, "p2_momentum_value"].values,
23
+ "-",
24
+ color=Config.COLORS_1[9],
25
+ alpha=0.5,
26
+ label=p2_name
27
+ )
28
+ plt.axhline(
29
+ y=0,
30
+ linestyle="--",
31
+ color="black",
32
+ alpha=0.5
33
+ )
34
+ plt.plot(
35
+ df.loc[:, "elapsed_time"].values,
36
+ df.loc[:, "p1_momentum_value_better"].values,
37
+ "-",
38
+ color=Config.COLORS_1[10],
39
+ alpha=0.7,
40
+ label="Degree of Superiority"
41
+ )
42
+
43
+ title = "Momentum"
44
+ # plt.title(title)
45
+
46
+ plt.xlabel("Elapsed time")
47
+ plt.ylabel("Momentum value")
48
+ plt.legend()
49
+
50
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
51
+
52
+ plt.show()
visualization/draw_parallel_coordinates.py ADDED
@@ -0,0 +1,46 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ def draw_parallel_coordinates(df):
8
+ df.drop("match_id", axis=1, inplace=True)
9
+ df.drop("player1", axis=1, inplace=True)
10
+ df.drop("player2", axis=1, inplace=True)
11
+ df.drop("elapsed_time", axis=1, inplace=True)
12
+ df.drop("set_no", axis=1, inplace=True)
13
+ df.drop("game_no", axis=1, inplace=True)
14
+ df.drop("point_no", axis=1, inplace=True)
15
+ df.drop("p1_sets", axis=1, inplace=True)
16
+ df.drop("p2_sets", axis=1, inplace=True)
17
+ df.drop("p1_games", axis=1, inplace=True)
18
+ df.drop("p2_games", axis=1, inplace=True)
19
+ df.drop("p1_points_won", axis=1, inplace=True)
20
+ df.drop("p2_points_won", axis=1, inplace=True)
21
+ df.drop("p1_distance_run", axis=1, inplace=True)
22
+ df.drop("p2_distance_run", axis=1, inplace=True)
23
+ df.drop("speed_mph", axis=1, inplace=True)
24
+ df.drop("p1_score_normal", axis=1, inplace=True)
25
+ df.drop("p2_score_normal", axis=1, inplace=True)
26
+ df.drop("p1_score_tiebreak", axis=1, inplace=True)
27
+ df.drop("p2_score_tiebreak", axis=1, inplace=True)
28
+ df.drop("p1_game_victor", axis=1, inplace=True)
29
+ df.drop("p2_game_victor", axis=1, inplace=True)
30
+ df.drop("p1_set_victor", axis=1, inplace=True)
31
+ df.drop("p2_set_victor", axis=1, inplace=True)
32
+
33
+ plt.figure(figsize=(10, 6))
34
+
35
+ pd.plotting.parallel_coordinates(df, "point_victor", colormap="viridis")
36
+
37
+ title = "Parallel Coordinates Plot"
38
+ plt.title(title)
39
+
40
+ plt.xlabel("Attributes")
41
+ plt.ylabel("Values")
42
+ plt.legend()
43
+
44
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
45
+
46
+ plt.show()
visualization/draw_play_flow.py ADDED
@@ -0,0 +1,87 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import *
4
+ from sklearn.preprocessing import label_binarize
5
+
6
+ from static.config import Config
7
+
8
+
9
+ def draw_play_flow(df, p1_name, p2_name, p1_ace, p2_ace, p1_net_pt_won, p2_net_pt_won, p1_break_pt_won, p2_break_pt_won):
10
+ plt.figure(figsize=(10, 6))
11
+
12
+ plt.plot(
13
+ df.loc[:, "elapsed_time"].values,
14
+ df.loc[:, "p1_points_won"].values,
15
+ "-",
16
+ color=Config.COLORS_1[6],
17
+ alpha=0.5,
18
+ label=p1_name
19
+ )
20
+ plt.plot(
21
+ df.loc[:, "elapsed_time"].values,
22
+ df.loc[:, "p2_points_won"].values,
23
+ "-",
24
+ color=Config.COLORS_1[7],
25
+ alpha=0.5,
26
+ label=p2_name
27
+ )
28
+
29
+ plt.scatter(
30
+ p1_ace.loc[:, "elapsed_time"].values,
31
+ p1_ace.loc[:, "p1_points_won"].values,
32
+ s=40,
33
+ c=Config.COLORS_1[0],
34
+ marker="v",
35
+ label="p1_ace"
36
+ )
37
+ plt.scatter(
38
+ p2_ace.loc[:, "elapsed_time"].values,
39
+ p2_ace.loc[:, "p2_points_won"].values,
40
+ s=40,
41
+ c=Config.COLORS_1[1],
42
+ marker="v",
43
+ label="p2_ace"
44
+ )
45
+ plt.scatter(
46
+ p1_net_pt_won.loc[:, "elapsed_time"].values,
47
+ p1_net_pt_won.loc[:, "p1_points_won"].values,
48
+ s=40,
49
+ c=Config.COLORS_1[2],
50
+ marker="*",
51
+ label="p1_net_pt_won"
52
+ )
53
+ plt.scatter(
54
+ p2_net_pt_won.loc[:, "elapsed_time"].values,
55
+ p2_net_pt_won.loc[:, "p2_points_won"].values,
56
+ s=40,
57
+ c=Config.COLORS_1[3],
58
+ marker="*",
59
+ label="p2_net_pt_won"
60
+ )
61
+ plt.scatter(
62
+ p1_break_pt_won.loc[:, "elapsed_time"].values,
63
+ p1_break_pt_won.loc[:, "p1_points_won"].values,
64
+ s=40,
65
+ c=Config.COLORS_1[4],
66
+ marker="+",
67
+ label="p1_break_pt_won"
68
+ )
69
+ plt.scatter(
70
+ p2_break_pt_won.loc[:, "elapsed_time"].values,
71
+ p2_break_pt_won.loc[:, "p2_points_won"].values,
72
+ s=40,
73
+ c=Config.COLORS_1[5],
74
+ marker="+",
75
+ label="p2_break_pt_won"
76
+ )
77
+
78
+ title = "Flow of play"
79
+ # plt.title(title)
80
+
81
+ plt.xlabel("Elapsed time")
82
+ plt.ylabel("Points")
83
+ plt.legend()
84
+
85
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
86
+
87
+ plt.show()
visualization/draw_pred_total.py ADDED
@@ -0,0 +1,42 @@
1
+ import numpy as np
2
+ from matplotlib import pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ def draw_pred_total(input_dict):
8
+ plt.figure(figsize=(10, 6))
9
+
10
+ i = 0
11
+ for name, cur_list in input_dict.items():
12
+ mylist = cur_list
13
+ plt.plot(
14
+ np.array([x for x in range(len(cur_list[0]))]),
15
+ cur_list[0],
16
+ "-",
17
+ color=Config.COLORS_4[i],
18
+ alpha=0.9,
19
+ label=name
20
+ )
21
+ i += 1
22
+
23
+ plt.plot(
24
+ np.array([x for x in range(len(mylist[1]))]),
25
+ mylist[1],
26
+ "--",
27
+ color=Config.COLORS_4[1],
28
+ alpha=0.9,
29
+ label="actual data"
30
+ )
31
+
32
+ title = "pred curve"
33
+
34
+ plt.xlabel("Sizes")
35
+ plt.ylabel("Value")
36
+ plt.legend()
37
+
38
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
39
+
40
+ plt.show()
41
+
42
+
visualization/draw_roc_auc_curve_total.py ADDED
@@ -0,0 +1,58 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import *
4
+ from sklearn.preprocessing import label_binarize
5
+
6
+ from static.config import Config
7
+
8
+
9
+ def draw_roc_auc_curve_total(input_dict, type):
10
+ plt.figure(figsize=(10, 6))
11
+
12
+ if type == "train":
13
+ i = 0
14
+ for label_name, values in input_dict.items():
15
+ fpr = values[0]
16
+ tpr = values[1]
17
+ thresholds = values[2]
18
+
19
+ plt.plot(
20
+ fpr,
21
+ tpr,
22
+ "o-",
23
+ color=Config.COLORS[i],
24
+ label=label_name+str(round(auc(fpr, tpr), 2))
25
+ )
26
+
27
+ i += 1
28
+
29
+ title = "Training roc-auc curve"
30
+ plt.title(title)
31
+
32
+ else:
33
+ i = 0
34
+ for label_name, values in input_dict.items():
35
+ fpr = values[0]
36
+ tpr = values[1]
37
+ thresholds = values[2]
38
+
39
+ plt.plot(
40
+ fpr,
41
+ tpr,
42
+ "o-",
43
+ color=Config.COLORS[i],
44
+ label=label_name + str(round(auc(fpr, tpr), 2))
45
+ )
46
+
47
+ i += 1
48
+
49
+ title = "Cross-validation roc-auc curve"
50
+ plt.title(title)
51
+
52
+ plt.xlabel("fpr")
53
+ plt.ylabel("tpr")
54
+ plt.legend()
55
+
56
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
57
+
58
+ plt.show()
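A sketch of how each input_dict entry can be built with sklearn's roc_curve for a binary task; the logistic-regression model and the train/test split are placeholders, and the ./diagram directory the function saves into is assumed to exist:

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

from visualization.draw_roc_auc_curve_total import draw_roc_auc_curve_total

x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=123)
scores = LogisticRegression(max_iter=5000).fit(x_train, y_train).predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, scores)
draw_roc_auc_curve_total({"Logistic regression, AUC=": [fpr, tpr, thresholds]}, "train")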
visualization/draw_scatter.py ADDED
@@ -0,0 +1,70 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from mpl_toolkits.mplot3d import Axes3D
4
+
5
+ from static.config import Config
6
+
7
+
8
+ # Draw scatter
9
+ def draw_scatter_2D(x_data, y_data, centers, title):
10
+ num_clusters = np.unique(y_data)
11
+
12
+ plt.figure(figsize=(10, 8))
13
+
14
+ for i in range(len(num_clusters)):
15
+ plt.scatter(x_data[y_data == i][:, 0], x_data[y_data == i][:, 1], s=1)
16
+ for i in range(len(num_clusters)):
17
+ plt.scatter(centers[i, 0], centers[i, 1], marker="*", s=50, c="black")
18
+
19
+ plt.title(title)
20
+
21
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
22
+
23
+ plt.show()
24
+
25
+
26
+ def draw_scatter_2D_1(x_data, title):
27
+ plt.figure(figsize=(10, 8))
28
+
29
+ plt.scatter(x_data[:, 0], x_data[:, 1], s=1)
30
+
31
+ plt.title(title)
32
+
33
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
34
+
35
+ plt.show()
36
+
37
+
38
+ def draw_scatter_3D(x_data, y_data, centers, title):
39
+ num_clusters = np.unique(y_data)
40
+
41
+ fig = plt.figure(figsize=(10, 8))
42
+
43
+ ax = Axes3D(fig)
44
+ fig.add_axes(ax)
45
+
46
+ for i in range(len(num_clusters)):
47
+ ax.scatter(x_data[y_data == i][:, 0], x_data[y_data == i][:, 1], x_data[y_data == i][:, 2], s=1)
48
+ for i in range(len(num_clusters)):
49
+ ax.scatter(centers[i, 0], centers[i, 1], centers[i, 2], marker="*", s=50, c="black")
50
+
51
+ plt.title(title)
52
+
53
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
54
+
55
+ plt.show()
56
+
57
+
58
+ def draw_scatter_3D_1(x_data, title):
59
+ fig = plt.figure(figsize=(10, 8))
60
+
61
+ ax = Axes3D(fig)
62
+ fig.add_axes(ax)
63
+
64
+ ax.scatter(x_data[:, 0], x_data[:, 1], x_data[:, 2], s=1)
65
+
66
+ plt.title(title)
67
+
68
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
69
+
70
+ plt.show()
visualization/draw_scatter_line_graph.py ADDED
@@ -0,0 +1,27 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ from static.config import Config
5
+
6
+
7
+ # draw scatter line graph
8
+ def draw_scatter_line_graph(x_data, y_pred_data, y_real_data, coef, intercept, labels, title):
9
+ # Manually adjust based on the data
10
+ layout = """
11
+ ABCDE
12
+ FGHIJ
13
+ """
14
+
15
+ fig, ax = plt.subplot_mosaic(layout, figsize=(16, 16))
16
+
17
+ for i in range(np.size(x_data, 1)):
18
+ ax[str(chr(i+65))].scatter(x_data[:, i], y_pred_data.T, color=Config.COLORS[0], s=4, label=labels[0])
19
+ ax[str(chr(i+65))].scatter(x_data[:, i], y_real_data, color=Config.COLORS[1], s=4, label=labels[1])
20
+ ax[str(chr(i+65))].plot(x_data[:, i], x_data[:, i] * coef[i] + intercept, color=Config.COLORS[2], markersize=4)
21
+ ax[str(chr(i + 65))].legend()
22
+
23
+ plt.suptitle(title)
24
+
25
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
26
+
27
+ plt.show()
visualization/draw_swings_and_positives.py ADDED
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import *
4
+ from sklearn.preprocessing import label_binarize
5
+
6
+ from static.config import Config
7
+
8
+
9
+ def draw_swings_and_positives(df, p1_name, p2_name):
10
+ plt.figure(figsize=(10, 6))
11
+
12
+ plt.plot(
13
+ df.loc[:, "elapsed_time"].values,
14
+ df.loc[:, "swing"].values,
15
+ "-",
16
+ color=Config.COLORS_2[2],
17
+ alpha=0.7,
18
+ label="Swing of Play"
19
+ )
20
+ plt.plot(
21
+ df.loc[:, "elapsed_time"].values,
22
+ df.loc[:, "p1_remain_positive"].values,
23
+ "-.",
24
+ color=Config.COLORS_2[0],
25
+ alpha=0.7,
26
+ label=p1_name
27
+ )
28
+ plt.plot(
29
+ df.loc[:, "elapsed_time"].values,
30
+ df.loc[:, "p2_remain_positive"].values,
31
+ "-.",
32
+ color=Config.COLORS_2[1],
33
+ alpha=0.7,
34
+ label=p2_name
35
+ )
36
+
37
+ title = "Standard time interval"
38
+ # plt.title(title)
39
+
40
+ plt.xlabel("Elapsed time")
41
+ plt.ylabel("Standard time interval")
42
+ plt.legend()
43
+
44
+ plt.savefig("./diagram/{}.png".format(title), dpi=300)
45
+
46
+ plt.show()