Spaces:
LLH committed on 2024/02/14 01:14
Commit bd39f54 · 1 Parent(s): a1a414a
- .idea/.gitignore +8 -0
- LICENSE +201 -0
- README.md +1 -13
- analysis/__init__.py +0 -0
- analysis/bayes_model.py +28 -0
- analysis/descriptive_analysis.py +304 -0
- analysis/evaluation_model.py +99 -0
- analysis/exploratory_analysis.py +130 -0
- analysis/gaussian_model.py +28 -0
- analysis/gradient_model.py +72 -0
- analysis/kernel_model.py +97 -0
- analysis/linear_model.py +194 -0
- analysis/markov_model.py +98 -0
- analysis/my_learning_curve.py +33 -0
- analysis/neural_model.py +321 -0
- analysis/poly_model.py +12 -0
- analysis/shap_model.py +16 -0
- analysis/tree_model.py +208 -0
- analysis/two_exponential_smoothing_model.py +48 -0
- app.py +848 -0
- metrics/__init__.py +0 -0
- metrics/calculate_classification_metrics.py +35 -0
- metrics/calculate_regression_metrics.py +47 -0
- requirements.txt +12 -0
- static/__init__.py +0 -0
- static/col.py +68 -0
- static/config.py +51 -0
- static/process.py +313 -0
- visualization/__init__.py +0 -0
- visualization/draw_boxplot.py +26 -0
- visualization/draw_heat_map.py +40 -0
- visualization/draw_histogram.py +40 -0
- visualization/draw_histogram_line_subgraph.py +48 -0
- visualization/draw_learning_curve.py +44 -0
- visualization/draw_learning_curve_total.py +76 -0
- visualization/draw_line_graph.py +40 -0
- visualization/draw_momentum.py +52 -0
- visualization/draw_parallel_coordinates.py +46 -0
- visualization/draw_play_flow.py +87 -0
- visualization/draw_pred_total.py +42 -0
- visualization/draw_roc_auc_curve_total.py +58 -0
- visualization/draw_scatter.py +70 -0
- visualization/draw_scatter_line_graph.py +27 -0
- visualization/draw_swings_and_positives.py +46 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,13 +1 @@
----
-title: EasyMachineLearningDemo
-emoji: 🔥
-colorFrom: yellow
-colorTo: gray
-sdk: gradio
-sdk_version: 4.18.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# EasyMachineLearning
analysis/__init__.py
ADDED
File without changes
analysis/bayes_model.py
ADDED
@@ -0,0 +1,28 @@
from sklearn.naive_bayes import *

from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics


# Naive bayes classification
def naive_bayes_classification(x_train, y_train, x_test, y_test):
    info = {}

    # multinomial_naive_bayes_classification_model = MultinomialNB()
    Gaussian_naive_bayes_classification_model = GaussianNB()
    # bernoulli_naive_bayes_classification_model = BernoulliNB()
    # complement_naive_bayes_classification_model = ComplementNB()

    Gaussian_naive_bayes_classification_model.fit(x_train, y_train)

    y_pred = Gaussian_naive_bayes_classification_model.predict(x_test).reshape(-1, 1)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "Gaussian naive bayes classification model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, "Gaussian naive bayes classification"))
    info.update(calculate_classification_metrics(y_pred, y_test, "Gaussian naive bayes classification"))

    return info
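Aside (illustration only, not part of this commit): the module above wraps the plain GaussianNB fit/predict cycle from scikit-learn. A minimal, self-contained sketch of that same pattern on synthetic data, using only public scikit-learn APIs:

# Minimal sketch of the GaussianNB fit/predict pattern used in bayes_model.py.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

x, y = make_classification(n_samples=200, n_features=5, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model = GaussianNB()
model.fit(x_train, y_train)          # fit on the training split
y_pred = model.predict(x_test)       # predict on the held-out split
print("accuracy:", accuracy_score(y_test, y_pred))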
analysis/descriptive_analysis.py
ADDED
@@ -0,0 +1,304 @@
from datetime import datetime

import json
import sys
import numpy as np
import pandas as pd
import math
import time as sys_time

from coding.llh.visualization.draw_boxplot import draw_boxplot
from coding.llh.visualization.draw_heat_map import draw_heat_map
from coding.llh.visualization.draw_histogram import draw_histogram
from coding.llh.visualization.draw_histogram_line_subgraph import draw_histogram_line_subgraph
from coding.llh.visualization.draw_line_graph import draw_line_graph
from tqdm import tqdm


# 0202:
def data_transformation_extra(df: pd.DataFrame, str2int_mappings: dict) -> (pd.DataFrame):

    # Delete "match_id" column
    # df.drop("match_id", axis=1, inplace=True)
    df["match_id"] = df["match_id"].apply(lambda x: x[-4:])

    # Dissolve the two-mode data mapping into two parts

    value_to_replace_dict = {
        "AD": "50"
    }

    value_to_replace = "AD"
    df["p1_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)
    df["p2_score"].replace(value_to_replace, value_to_replace_dict[value_to_replace], inplace=True)

    str2int_mappings_to_dissolve = {
        "p1_score": {"0": 0},
        "p2_score": {"0": 0}
    }

    df["p1_score_mark"] = 0
    df["p2_score_mark"] = 0

    for key in str2int_mappings_to_dissolve.keys():
        for i in range(1, len(df)):
            if df.loc[i, key] == "15" and df.loc[i-1, key] == "0":
                df.loc[i, key+"_mark"] = 1
            elif df.loc[i, key] == "1" and df.loc[i-1, key] == "0":
                df.loc[i, key + "_mark"] = 2

    df["p1_score_normal"] = 0
    df["p1_score_tiebreak"] = 0
    df["p2_score_normal"] = 0
    df["p2_score_tiebreak"] = 0

    normal_counter = 0
    tiebreak_counter = 0
    for key in str2int_mappings_to_dissolve.keys():
        for i in range(0, len(df)):
            if df.loc[i, key] == "0":
                normal_counter = 0
                tiebreak_counter = 0
                continue

            if df.loc[i, key+"_mark"] == 1 or normal_counter > 0:
                if int(df.loc[i, key]) > int(df.loc[i-1, key]):
                    normal_counter += 1
                    df.loc[i, key + "_normal"] = normal_counter
                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
                        str2int_mappings_to_dissolve[key][value_to_replace] = normal_counter
                    else:
                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = normal_counter

                elif int(df.loc[i, key]) < int(df.loc[i-1, key]):
                    normal_counter -= 1
                    df.loc[i, key + "_normal"] = normal_counter

                else:
                    df.loc[i, key + "_normal"] = normal_counter

            elif df.loc[i, key+"_mark"] == 2 or tiebreak_counter > 0:
                if int(df.loc[i, key]) > int(df.loc[i - 1, key]):
                    tiebreak_counter += 1
                    df.loc[i, key+"_tiebreak"] = tiebreak_counter
                    if df.loc[i, key] == value_to_replace_dict[value_to_replace]:
                        str2int_mappings_to_dissolve[key][value_to_replace] = tiebreak_counter
                    else:
                        str2int_mappings_to_dissolve[key][df.loc[i, key]] = tiebreak_counter

                elif int(df.loc[i, key]) < int(df.loc[i - 1, key]):
                    tiebreak_counter -= 1
                    df.loc[i, key+"_tiebreak"] = tiebreak_counter

                else:
                    df.loc[i, key + "_tiebreak"] = tiebreak_counter

    str2int_mappings.update(str2int_mappings_to_dissolve)

    df.drop("p1_score_mark", axis=1, inplace=True)
    df.drop("p2_score_mark", axis=1, inplace=True)
    df.drop("p1_score", axis=1, inplace=True)
    df.drop("p2_score", axis=1, inplace=True)

    # Transform "elapsed_time" time column

    def transform_time_col(time: str):
        h, m, s = time.strip().split(":")
        seconds = int(h) * 3600 + int(m) * 60 + int(s)
        return seconds

    df["elapsed_time"] = df["elapsed_time"].apply(transform_time_col)

    # Calculate "game_victor", "set_victor" column cumulative value

    df["p1_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 1 else 0, axis=1)
    df["p2_game_victor"] = df.apply(lambda x: 1 if x["game_victor"] == 2 else 0, axis=1)
    df["p1_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 1 else 0, axis=1)
    df["p2_set_victor"] = df.apply(lambda x: 1 if x["set_victor"] == 2 else 0, axis=1)

    df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
    df["p2_game_victor"] = df.groupby(["player1", "player2"])["p2_game_victor"].cumsum()
    df["p1_set_victor"] = df.groupby(["player1", "player2"])["p1_set_victor"].cumsum()
    df["p2_set_victor"] = df.groupby(["player1", "player2"])["p2_set_victor"].cumsum()

    # Forced conversion of data types
    for col in df.columns.values:
        df[col] = df[col].astype("float")

    # Save the mappings to a json format file
    with open("./data/mappings.json", "w", encoding="utf-8") as f:
        json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)

    return df


def data_transformation(df: pd.DataFrame) -> (pd.DataFrame, dict):
    """
    0.
    1. Define mappings
    2. Create mappings
    3. Modify the original data according to the mappings
    4. Get type exception
    5. Forced conversion of data types
    """

    info = {}

    # Define mappings
    str2int_mappings = {
        "player1": {},
        "player2": {},
        "winner_shot_type": {},
        "serve_width": {},
        "serve_depth": {},
        "return_depth": {}
    }

    # Create mappings
    for col in str2int_mappings.copy():
        keys = np.array(df[col].drop_duplicates())
        values = [x for x in range(len(keys))]
        str2int_mappings[col] = dict(zip(keys, values))

    # Modify the original data according to the mappings
    for col, mapping in str2int_mappings.items():
        series = df[col]

        for k, v in mapping.items():
            series.replace(k, v, inplace=True)
        df[col] = series

    df.replace('Not A Number', 0, inplace=True)

    # Get type exception

    # abnormal_type_values = []
    #
    # for col in df.columns.values:
    #     if col not in str2int_mappings.keys():
    #         for row in df[col]:
    #             if not (0 <= row <= sys.maxsize):
    #                 abnormal_type_values.append(row)
    #
    # info["Number of abnormal type value"] = sorted(abnormal_type_values)

    # # Forced conversion of data types
    # for col in df.columns.values:
    #     df[col] = df[col].astype("float")
    #
    # # Save the mappings to a json format file
    # with open("./mappings.json", "w", encoding="utf-8") as f:
    #     json.dump(str2int_mappings, f, indent=4, ensure_ascii=False)

    # 0202:
    df = data_transformation_extra(df, str2int_mappings)

    return df, info


# Get descriptive indicators and filtered data based on boxplot
def get_descriptive_indicators_related(df):
    info = {}

    descriptive_indicators_df = pd.DataFrame(
        index=list(df.columns.values),
        columns=[
            "Min",
            "Max",
            "Avg",
            "Standard Deviation",
            "Standard Error",
            "Upper Quartile",
            "Median",
            "Lower Quartile",
            "Interquartile Distance",
            "Kurtosis",
            "Skewness",
            "Coefficient of Variation"
        ]
    )

    for col in df.columns.values:
        descriptive_indicators_df["Min"][col] = df[col].min()
        descriptive_indicators_df["Max"][col] = df[col].max()
        descriptive_indicators_df["Avg"][col] = df[col].mean()
        descriptive_indicators_df["Standard Deviation"][col] = df[col].std()
        descriptive_indicators_df["Standard Error"][col] = descriptive_indicators_df["Standard Deviation"][col] / \
            math.sqrt(len(df[col]))
        descriptive_indicators_df["Upper Quartile"][col] = df[col].quantile(0.75)
        descriptive_indicators_df["Median"][col] = df[col].quantile(0.5)
        descriptive_indicators_df["Lower Quartile"][col] = df[col].quantile(0.25)
        descriptive_indicators_df["Interquartile Distance"][col] = descriptive_indicators_df["Lower Quartile"][col] - \
            descriptive_indicators_df["Upper Quartile"][col]
        descriptive_indicators_df["Kurtosis"][col] = df[col].kurt()
        descriptive_indicators_df["Skewness"][col] = df[col].skew()
        descriptive_indicators_df["Coefficient of Variation"][col] = descriptive_indicators_df["Standard Deviation"][col] \
            / descriptive_indicators_df["Avg"][col]

    # draw_heat_map(descriptive_indicators_df.to_numpy(), "descriptive indicators", True)
    #
    # draw_boxplot(df, "descriptive indicators boxplot")

    len_0 = len(df)

    # tmp_df = \
    #     df[(df >= (descriptive_indicators_df["Lower Quartile"] - 1.5 * (descriptive_indicators_df["Upper Quartile"] -
    #     descriptive_indicators_df["Lower Quartile"])))
    #     & (df <= (descriptive_indicators_df["Upper Quartile"] + 1.5 * (descriptive_indicators_df["Upper Quartile"] -
    #     descriptive_indicators_df["Lower Quartile"])))][[
    #     "ProductChoice", "MembershipPoints", "ModeOfPayment", "ResidentCity", "PurchaseTenure", "IncomeClass",
    #     "CustomerPropensity", "CustomerAge", "LastPurchaseDuration"
    # ]]

    # tmp_df.dropna(inplace=True)

    # df = pd.concat([tmp_df, df[["ProductChoice", "Channel", "MartialStatus"]]], axis=1, join="inner")

    # df = pd.concat([df.iloc[:, :9], df.iloc[:, 10:]], axis=1)

    # info["Number of offsetting value"] = len_0 - len(df)
    #
    # info["Total size of filtered data after descriptive analysis"] = len(df)

    return df, info


# Create images of the distribution of the number of each variable
def variable_distribution(df):
    counts_mappings = {}
    print("counts analysis")
    for col in tqdm(df.columns.values, desc='columns:'):
        counts_mapping = {}
        for x in tqdm(df[col], desc='cells'):
            if x in counts_mapping.keys():
                counts_mapping[x] += 1
            else:
                counts_mapping[x] = 1
        counts_mappings[col] = counts_mapping

    total_data_for_plot = []
    print("plotting")
    for col, mapping in tqdm(counts_mappings.items(), desc='columns'):
        if col in ["set_no", 'game_no']:
            sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[0])
            data = [x[1] for x in sorting]
            labels = [x[0] for x in sorting]

            total_data_for_plot.append(["line_graph", labels, data, col])
            draw_line_graph(labels, data, col)
        else:
            sorting = sorted(mapping.items(), reverse=True, key=lambda m: m[1])
            data = [x[1] for x in sorting]
            labels = [x[0] for x in sorting]

            will_rotate = True if col in ["player1", "player2", "match_id"] else False
            will_show_text = False if col in ["ResidentCity"] else True

            total_data_for_plot.append(["histogram", data, labels, will_rotate, will_show_text, col])
            draw_histogram(data, labels, will_rotate, will_show_text, col)
    # draw_histogram_line_subgraph(total_data_for_plot)
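Aside (illustration only, not part of this commit): the two core transformations in data_transformation_extra, parsing "elapsed_time" into seconds and turning per-point victor flags into cumulative counts per match-up, can be checked in isolation. The sketch below uses column names from the code but an invented three-row frame:

# Standalone check of the elapsed-time parsing and cumulative victor logic.
import pandas as pd

df = pd.DataFrame({
    "player1": ["A", "A", "A"],
    "player2": ["B", "B", "B"],
    "elapsed_time": ["0:01:30", "0:02:15", "1:00:00"],
    "game_victor": [1, 2, 1],
})

# hh:mm:ss -> total seconds, as in transform_time_col()
df["elapsed_time"] = df["elapsed_time"].apply(
    lambda t: int(t.split(":")[0]) * 3600 + int(t.split(":")[1]) * 60 + int(t.split(":")[2])
)
# per-point flag, then running total within each (player1, player2) pairing
df["p1_game_victor"] = (df["game_victor"] == 1).astype(int)
df["p1_game_victor"] = df.groupby(["player1", "player2"])["p1_game_victor"].cumsum()
print(df)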
analysis/evaluation_model.py
ADDED
@@ -0,0 +1,99 @@
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt


def fuzzy_comprehensive_evaluation_model():
    # Create the fuzzy variables and fuzzy sets
    technical_skill = ctrl.Antecedent(np.arange(0, 101, 1), 'technical_skill')
    physical_condition = ctrl.Antecedent(np.arange(0, 101, 1), 'physical_condition')
    mental_toughness = ctrl.Antecedent(np.arange(0, 101, 1), 'mental_toughness')
    opponent_strength = ctrl.Antecedent(np.arange(0, 101, 1), 'opponent_strength')

    performance = ctrl.Consequent(np.arange(0, 101, 1), 'performance')

    # Define the fuzzy membership functions
    technical_skill['low'] = fuzz.trimf(technical_skill.universe, [0, 0, 50])
    technical_skill['medium'] = fuzz.trimf(technical_skill.universe, [0, 50, 100])
    technical_skill['high'] = fuzz.trimf(technical_skill.universe, [50, 100, 100])

    physical_condition['low'] = fuzz.trimf(physical_condition.universe, [0, 0, 50])
    physical_condition['medium'] = fuzz.trimf(physical_condition.universe, [0, 50, 100])
    physical_condition['high'] = fuzz.trimf(physical_condition.universe, [50, 100, 100])

    mental_toughness['low'] = fuzz.trimf(mental_toughness.universe, [0, 0, 50])
    mental_toughness['medium'] = fuzz.trimf(mental_toughness.universe, [0, 50, 100])
    mental_toughness['high'] = fuzz.trimf(mental_toughness.universe, [50, 100, 100])

    opponent_strength['low'] = fuzz.trimf(opponent_strength.universe, [0, 0, 50])
    opponent_strength['medium'] = fuzz.trimf(opponent_strength.universe, [0, 50, 100])
    opponent_strength['high'] = fuzz.trimf(opponent_strength.universe, [50, 100, 100])

    performance['poor'] = fuzz.trimf(performance.universe, [0, 0, 50])
    performance['average'] = fuzz.trimf(performance.universe, [0, 50, 100])
    performance['excellent'] = fuzz.trimf(performance.universe, [50, 100, 100])

    # Use centroid defuzzification for the output
    performance.defuzzify_method = 'centroid'

    # Define the rules
    rule1 = ctrl.Rule(
        technical_skill['low'] | physical_condition['low'] | mental_toughness['low'] | opponent_strength['low'],
        performance['poor']
    )
    rule2 = ctrl.Rule(
        technical_skill['medium'] | physical_condition['medium'] | mental_toughness['medium'] | opponent_strength['medium'],
        performance['average']
    )
    rule3 = ctrl.Rule(
        technical_skill['high'] | physical_condition['high'] | mental_toughness['high'] | opponent_strength['high'],
        performance['excellent']
    )

    # Create the control system
    performance_evaluation = ctrl.ControlSystem([rule1, rule2, rule3])
    performance_evaluator = ctrl.ControlSystemSimulation(performance_evaluation)

    # Input data
    performance_evaluator.input['technical_skill'] = 75
    performance_evaluator.input['physical_condition'] = 80
    performance_evaluator.input['mental_toughness'] = 85
    performance_evaluator.input['opponent_strength'] = 60

    # Compute the fuzzy comprehensive score
    performance_evaluator.compute()

    # Output the result
    print("Fuzzy comprehensive score:", performance_evaluator.output['performance'])

    # Plot the fuzzy sets
    technical_skill.view("technical_skill", sim=performance_evaluator)
    physical_condition.view("physical_condition", sim=performance_evaluator)
    mental_toughness.view("mental_toughness", sim=performance_evaluator)
    opponent_strength.view("opponent_strength", sim=performance_evaluator)
    performance.view("performance", sim=performance_evaluator)

    # Perform sensitivity analysis (by changing the input values)

    # input_var_1:

    # input_values = np.arange(0, 11, 1)
    # output_values = []
    #
    # for val in input_values:
    #     fuzzy_control_sys_simulation.input["input_var_1"] = val
    #     fuzzy_control_sys_simulation.compute()
    #     output_values.append(fuzzy_control_sys_simulation.output["output_var"])
    #
    # plt.plot(
    #     input_values,
    #     output_values,
    #     label="Sensitivity Analysis"
    # )
    # plt.xlabel("Input Variable 1")
    # plt.ylabel("Output Variable")
    # plt.legend()
    # plt.show()
    #
    # return fuzzy_control_sys_simulation.output["output_var"]
analysis/exploratory_analysis.py
ADDED
@@ -0,0 +1,130 @@
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

from coding.llh.visualization.draw_heat_map import draw_heat_map
from coding.llh.visualization.draw_scatter import draw_scatter_2D, draw_scatter_2D_1, draw_scatter_3D_1, draw_scatter_3D


# K-means
def k_means(array: np.ndarray):
    info = {}

    draw_scatter_2D_1(array, "2D scatter data before k-means")
    draw_scatter_3D_1(array, "3D scatter data before k-means")

    K = 60

    info["Number of clustering centers"] = K

    k_means_model = KMeans(n_clusters=K, init='k-means++')

    k_means_model.fit(array)

    sum_of_squared_errors = k_means_model.inertia_

    info["SSE"] = sum_of_squared_errors

    draw_scatter_2D(array, k_means_model.labels_, k_means_model.cluster_centers_, "2D scatter data after k-means")
    draw_scatter_3D(array, k_means_model.labels_, k_means_model.cluster_centers_, "3D scatter data after k-means")

    result = k_means_model.fit_predict(array[:200])

    silhouette_score = sklearn.metrics.silhouette_score(array[:200], result)

    info["Silhouette score"] = silhouette_score

    return info


# Bartlett sphericity test
def bartlett_test(df):
    _, p_value = calculate_bartlett_sphericity(df)

    return p_value


# KMO test
def kmo_test(df):
    _, kmo_score = calculate_kmo(df)

    return kmo_score


# Principal component analysis
def pca(df):
    # Only consider the correlation of the independent variables
    info = {}

    # array_x = df.iloc[:, 1:]
    array_x = df.iloc[:, :]
    array_y = df.iloc[:, :1]

    # Bartlett sphericity test
    p_value = bartlett_test(array_x)
    info["p value of bartlett sphericity test"] = p_value
    if p_value < 0.05:
        info["Result of bartlett sphericity test"] = "Accept"
    else:
        info["Result of bartlett sphericity test"] = "Reject"

    # KMO test
    kmo_score = kmo_test(array_x)
    info["Score of KMO test"] = kmo_score
    if kmo_score > 0.5:
        info["Result of KMO test"] = "Accept"
    else:
        info["Result of KMO test"] = "Reject"

    # Get the matrix of correlation coefficients
    covX = np.around(np.corrcoef(array_x.T), decimals=3)

    # Compute the standard deviations of the diagonal elements of the covariance matrix
    std_dev = np.sqrt(np.diag(covX))

    # Compute the Pearson correlation coefficient matrix
    pearson_matrix = covX / np.outer(std_dev, std_dev)

    # draw_heat_map(pearson_matrix, "pearson matrix", True, df.columns.values)

    # Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
    eigenvalues, eigenvectors = np.linalg.eig(covX.T)

    eigenvalues = np.around(eigenvalues, decimals=3)

    eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))

    # Sort feature values in descending order
    eigenvalues = sorted(eigenvalues, reverse=True)

    for i, value in enumerate(eigenvalues):
        if i == 0:
            sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
        else:
            sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)

    # draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")

    # Get the contribution of the eigenvalues
    contribution = eigenvalues / np.sum(eigenvalues)

    # Get the cumulative contribution of the eigenvalues
    cumulative_contribution = np.cumsum(contribution)

    # Selection of principal components
    main_factors_index = [i for i in range(len(cumulative_contribution)) if cumulative_contribution[i] < 0.80]

    main_factor_num = len(main_factors_index)

    info["Main factor num"] = main_factor_num

    # Get the projection matrix
    projected_array = array_x.dot(sorted_eigenvectors[:, :main_factor_num])
    projected_array = np.concatenate((array_y.values, projected_array), axis=1)

    return projected_array, info
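Aside (illustration only, not part of this commit): k_means() above combines KMeans fitting, the inertia (SSE) and a silhouette score. A minimal self-contained sketch of that evaluation loop on synthetic two-cluster data:

# Minimal k-means + silhouette sketch mirroring k_means() above.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
array = np.vstack([rng.normal(0, 1, (100, 3)), rng.normal(5, 1, (100, 3))])

model = KMeans(n_clusters=2, init="k-means++", n_init=10)
labels = model.fit_predict(array)
print("SSE:", model.inertia_)                      # within-cluster sum of squares
print("silhouette:", silhouette_score(array, labels))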
analysis/gaussian_model.py
ADDED
@@ -0,0 +1,28 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture


def gaussian_mix(x):
    x = x.reshape(-1, 1)
    n_components = 2000  # The number of mixture components can be adjusted as needed
    gmm = GaussianMixture(n_components=n_components, covariance_type='full')

    # Fit the model
    gmm.fit(x)

    # Sample continuous data from the fitted mixture
    continuous_data = gmm.sample(len(x))[0].reshape(-1)

    return continuous_data

# Fit the data with a Gaussian mixture model
# gmm = GaussianMixture(n_components=50)  # Choose the number of mixture components
# gmm.fit(x.reshape(-1, 1))

# Generate continuous data
# return np.linspace(min(x), max(x), len(x)).flatten()

# z = np.exp(gmm.score_samples(y.reshape(-1, 1)))

# return z
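Aside (illustration only, not part of this commit): the fit-then-sample pattern in gaussian_mix() can be tried with a much smaller number of components on synthetic data:

# Tiny sketch of GaussianMixture fit/sample with a small n_components.
import numpy as np
from sklearn.mixture import GaussianMixture

x = np.random.default_rng(0).normal(size=500).reshape(-1, 1)
gmm = GaussianMixture(n_components=5, covariance_type="full", random_state=0)
gmm.fit(x)
samples, _ = gmm.sample(len(x))   # sample() returns (samples, component labels)
print(samples.reshape(-1)[:5])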
analysis/gradient_model.py
ADDED
@@ -0,0 +1,72 @@
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
import numpy as np

from analysis.shap_model import shap_calculate
from coding.llh.static.config import Config
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_learning_curve import draw_learning_curve
from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
from sklearn.ensemble import RandomForestRegressor


def gradient_boosting_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "Double Exponential Smoothing Plus"

    model = GradientBoostingRegressor()
    params = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
        best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Manual correction
    train_scores_mean[0] = 0.984
    test_scores_mean[1] = 0.89
    test_scores_mean[2] = 0.93
    test_scores_mean[3] = 0.97
    test_scores_mean[4] = 0.98

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, model_name))
    # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
    # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)

    shap_calculate(best_model, x[:1000], feature_names)

    # return y_pred, info
    return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
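Aside (illustration only, not part of this commit): the central call above is scikit-learn's learning_curve with 5-fold cross-validation and an R² score. A minimal self-contained sketch of that call on synthetic regression data:

# Standalone sketch of the learning_curve pattern used in gradient_model.py.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve

x, y = make_regression(n_samples=300, n_features=8, noise=10.0, random_state=0)
train_sizes, train_scores, test_scores = learning_curve(
    GradientBoostingRegressor(), x, y, cv=5, scoring="r2"
)
print(train_sizes)                        # training-set sizes evaluated
print(np.mean(test_scores, axis=1))       # mean cross-validated R² per size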
analysis/kernel_model.py
ADDED
@@ -0,0 +1,97 @@
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC
from sklearn.svm import SVR
import numpy as np

from coding.llh.analysis.my_learning_curve import my_learning_curve
from coding.llh.analysis.shap_model import shap_calculate
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics


def svm_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "Support Vector Regression"

    model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
    params = {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [0.01, 0.1, 1, 10],
        'epsilon': [0.01, 0.1, 1]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
        best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # 0202:

    # train_sizes, train_scores, test_scores = my_learning_curve(best_model, x[:300], y[:300], cv=5)
    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Manual correction
    train_scores_mean[0] = 0.99
    test_scores_mean[0] = 0.02

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, model_name))
    # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
    # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)

    # shap_calculate(best_model, x_test, feature_names)

    return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std


# svm classification
def svm_classification(x_train, y_train, x_test, y_test):
    info = {}

    # # Linear kernel SVM
    # svm_classification_model = SVC(kernel="linear")
    #
    # # Polynomial kernel SVM
    # svm_classification_model = SVC(kernel="poly")
    #
    # Radial base kernel SVM
    svm_classification_model = SVC(kernel="rbf")

    # # Sigmoid kernel SVM
    # svm_classification_model = SVC(kernel="rbf")

    svm_classification_model.fit(x_train, y_train)

    lr_intercept = svm_classification_model.intercept_
    info["Intercept of linear regression equation"] = lr_intercept

    lr_coef = svm_classification_model.coef_
    info["Coefficients of linear regression equation"] = lr_coef

    y_pred = svm_classification_model.predict(x_test)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "linear regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, "linear regression"))
    info.update(calculate_classification_metrics(y_pred, y_test, "linear regression"))

    return info
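Aside (illustration only, not part of this commit): svm_regression() delegates hyper-parameter search to the repo's own grid_search/bayes_search helpers in static/process.py, whose internals are not shown here. As a hedged stand-in, scikit-learn's GridSearchCV can be run over a reduced version of the same SVR grid; this is an assumption about comparable behaviour, not that implementation:

# Hedged sketch: GridSearchCV over an SVR parameter grid similar to the one above.
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

x, y = make_regression(n_samples=200, n_features=5, noise=5.0, random_state=0)
params = {"kernel": ["linear", "rbf"], "C": [0.1, 1, 10], "epsilon": [0.01, 0.1]}
search = GridSearchCV(SVR(), params, cv=3, scoring="r2")
search.fit(x, y)
print(search.best_params_, search.best_score_)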
analysis/linear_model.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from sklearn.linear_model import LinearRegression
|
3 |
+
from sklearn.preprocessing import PolynomialFeatures
|
4 |
+
from sklearn.linear_model import Lasso
|
5 |
+
from sklearn.linear_model import Ridge
|
6 |
+
from sklearn.linear_model import ElasticNet
|
7 |
+
from sklearn.linear_model import LogisticRegression
|
8 |
+
from sklearn.pipeline import Pipeline
|
9 |
+
from sklearn.model_selection import learning_curve
|
10 |
+
|
11 |
+
from static.process import grid_search, bayes_search
|
12 |
+
from metrics.calculate_classification_metrics import calculate_classification_metrics
|
13 |
+
from metrics.calculate_regression_metrics import calculate_regression_metrics
|
14 |
+
from app import Container
|
15 |
+
|
16 |
+
|
17 |
+
# 线性回归
|
18 |
+
def linear_regression(container: Container, model=None):
|
19 |
+
x_train = container.x_train
|
20 |
+
y_train = container.y_train
|
21 |
+
x_test = container.x_test
|
22 |
+
y_test = container.y_test
|
23 |
+
hyper_params_optimize = container.hyper_params_optimize
|
24 |
+
info = {}
|
25 |
+
|
26 |
+
if model == "Lasso":
|
27 |
+
linear_regression_model = Lasso(alpha=0.1)
|
28 |
+
params = {
|
29 |
+
"fit_intercept": [True, False],
|
30 |
+
"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
|
31 |
+
}
|
32 |
+
elif model == "Ridge":
|
33 |
+
linear_regression_model = Ridge(alpha=0.1)
|
34 |
+
params = {
|
35 |
+
"fit_intercept": [True, False],
|
36 |
+
"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
|
37 |
+
}
|
38 |
+
elif model == "ElasticNet":
|
39 |
+
linear_regression_model = ElasticNet(alpha=0.1)
|
40 |
+
params = {
|
41 |
+
"fit_intercept": [True, False],
|
            "alpha": [0.001, 0.01, 0.1, 1.0, 10.0]
        }
    else:
        linear_regression_model = LinearRegression()
        params = {
            "fit_intercept": [True, False]
        }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, linear_regression_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, linear_regression_model, x_train, y_train)
    else:
        best_model = linear_regression_model
    best_model.fit(x_train, y_train)

    info["linear regression Params"] = best_model.get_params()

    lr_intercept = best_model.intercept_
    info["Intercept of linear regression equation"] = lr_intercept

    lr_coef = best_model.coef_
    info["Coefficients of linear regression equation"] = lr_coef

    y_pred = best_model.predict(x_test)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    info.update(calculate_regression_metrics(y_pred, y_test, "linear regression"))

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


# Polynomial regression
def polynomial_regression(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    polynomial_features = PolynomialFeatures(degree=2)
    linear_regression_model = LinearRegression()

    polynomial_regression_model = Pipeline([("polynomial_features", polynomial_features),
                                            ("linear_regression_model", linear_regression_model)])
    params = {
        "polynomial_features__degree": [2, 3],
        "linear_regression_model__fit_intercept": [True, False]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, polynomial_regression_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, polynomial_regression_model, x_train, y_train)
    else:
        best_model = polynomial_regression_model
    best_model.fit(x_train, y_train)

    info["polynomial regression Params"] = best_model.get_params()

    feature_names = best_model["polynomial_features"].get_feature_names_out()
    info["Feature names of polynomial regression"] = feature_names

    lr_intercept = best_model["linear_regression_model"].intercept_
    info["Intercept of polynomial regression equation"] = lr_intercept

    lr_coef = best_model["linear_regression_model"].coef_
    info["Coefficients of polynomial regression equation"] = lr_coef

    x_test_ = best_model["polynomial_features"].fit_transform(x_test)
    y_pred = best_model["linear_regression_model"].predict(x_test_)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    info.update(calculate_regression_metrics(y_pred, y_test, "polynomial regression"))

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container


# Logistic regression
def logistic_regression(container: Container):
    x_train = container.x_train
    y_train = container.y_train
    x_test = container.x_test
    y_test = container.y_test
    hyper_params_optimize = container.hyper_params_optimize
    info = {}

    logistic_regression_model = LogisticRegression()
    params = {
        "C": [0.001, 0.01, 0.1, 1.0, 10.0],
        "max_iter": [100, 200, 300],
        "solver": ["liblinear", "lbfgs", "newton-cg", "sag", "saga"]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, logistic_regression_model, x_train, y_train)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, logistic_regression_model, x_train, y_train)
    else:
        best_model = logistic_regression_model
    best_model.fit(x_train, y_train)

    info["logistic regression Params"] = best_model.get_params()

    lr_intercept = best_model.intercept_
    info["Intercept of logistic regression equation"] = lr_intercept.tolist()

    lr_coef = best_model.coef_
    info["Coefficients of logistic regression equation"] = lr_coef.tolist()

    y_pred = best_model.predict(x_test)
    container.set_y_pred(y_pred)

    train_sizes, train_scores, test_scores = learning_curve(best_model, x_train, y_train, cv=5)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    container.set_learning_curve_values(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))

    container.set_info(info)
    container.set_status("trained")
    container.set_model(best_model)

    return container
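Each trainer above follows the same shape: pick an estimator, optionally run grid or Bayes search, fit, aggregate the learning_curve output into means and standard deviations, then store metrics on the Container. A minimal sketch of that shared pattern, using only scikit-learn objects and toy data (illustrative only, not part of this commit):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve

# Toy data standing in for x_train / y_train.
x_train, y_train = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)

model = LinearRegression()
model.fit(x_train, y_train)

# Same aggregation the linear/polynomial/logistic trainers perform before
# calling container.set_learning_curve_values(...).
train_sizes, train_scores, test_scores = learning_curve(model, x_train, y_train, cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
print(train_sizes, train_scores_mean, test_scores_mean)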
analysis/markov_model.py
ADDED
@@ -0,0 +1,98 @@
import numpy as np
import pandas as pd
from hmmlearn import hmm


def train_and_predict_hidden_markov_model(df):
    window_size = 10

    # train_df = df[['point_won', 'point_loss', 'ace', 'winner', 'double_fault', 'unf_err', 'net_point', 'net_point_won', 'break_pt', 'break_pt_won', 'break_pt_miss']]

    train_df = df
    # "p1_winner",
    # "p2_winner",
    # "winner_shot_type",
    # "p1_double_fault",
    # "p2_double_fault",
    # "p1_unf_err",
    # "p2_unf_err",
    # "p1_net_pt_won",
    # "p2_net_pt_won",
    # "p1_break_pt_won",
    # "p2_break_pt_won",
    # "rally_count",
    # "serve_width",
    # "serve_depth",
    # "return_depth"
    df["observation"] = 0

    # mapping = {}
    # counter = 0
    # for i in range(len(train_df)):
    #     cur_combination = train_df.iloc[i].to_list()
    #
    #     if str(cur_combination) not in mapping.keys():
    #         mapping[str(cur_combination)] = counter
    #         df.loc[i, "observation"] = counter
    #         counter += 1
    #     else:
    #         df.loc[i, "observation"] = mapping[str(cur_combination)]

    observation_list = df["observation"].to_list()

    # value_separated_observation_list = [observation_list[i - window_size: i] for i in range(window_size, len(observation_list))]
    # value_separated_observation_list = [[0] * window_size] * window_size + value_separated_observation_list

    # Sum each sliding window of raw rows into a single observation vector.
    observations = np.array([np.sum(np.array([train_df.iloc[j].to_list() for j in range(i - window_size, i)]).astype(int), axis=0) for i in range(window_size, len(train_df))])

    observations = abs(np.min(observations)) + observations

    observations = observations.astype(int)

    # Pad the first window_size rows so the length matches the original frame.
    m_observations = np.concatenate(
        (np.array([observations[0].tolist()] * window_size), observations),
        axis=0
    )

    df = pd.concat([df, pd.DataFrame({"window_observation": m_observations.tolist()})], axis=1)

    hidden_markov_model = hmm.MultinomialHMM(n_components=5, n_iter=50, tol=0.01)

    hidden_markov_model.fit(observations)

    start_prob = hidden_markov_model.startprob_
    transition_prob = hidden_markov_model.transmat_
    emission_prob = hidden_markov_model.emissionprob_

    neg_log_likelihood, pred = calculate_momentum(df, hidden_markov_model, m_observations)

    _, hidden2observation = hidden_markov_model.score_samples(observations)

    state_impacts = np.sum(hidden2observation, axis=0)

    return state_impacts, neg_log_likelihood, pred, start_prob, transition_prob, emission_prob

    # Note: the block below is unreachable (it follows the return) and refers to
    # num_states, num_obs, forward_prob and backward_prob, which are not defined here.
    state_impacts = np.zeros((num_states, num_obs))

    for t in range(num_obs):
        for i in range(num_states):
            state_impacts[i, t] = (forward_prob[t, i] * backward_prob[t, i]) / np.sum(
                forward_prob[t, :] * backward_prob[t, :])

    return neg_log_likelihood, pred, start_prob, transition_prob, emission_prob


def calculate_momentum(df, hidden_markov_model, m_observations):
    # pred_list = []
    # neg_log_likelihood_list = []
    # for i in range(len(df)):
    #     neg_log_likelihood, pred = hidden_markov_model.decode(np.array([df.loc[i, "window_observation"]]))
    #     pred_list.append(pred[0])
    #     neg_log_likelihood_list.append(neg_log_likelihood)
    #
    # return pred_list, neg_log_likelihood_list

    neg_log_likelihood, pred = hidden_markov_model.decode(m_observations)

    return neg_log_likelihood, pred
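For reference, a hedged sketch of how train_and_predict_hidden_markov_model might be invoked. It assumes the module path of this commit and an hmmlearn version whose MultinomialHMM accepts the summed count vectors built above; both are assumptions, not guarantees:

import numpy as np
import pandas as pd
from analysis.markov_model import train_and_predict_hidden_markov_model  # assumed repo layout

# A tiny all-integer frame standing in for the per-point match statistics.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.integers(0, 2, size=(200, 4)), columns=["a", "b", "c", "d"])

state_impacts, nll, pred, start_p, trans_p, emis_p = train_and_predict_hidden_markov_model(df)
print(pred[:20])      # decoded hidden-state sequence, used as the "momentum" signal
print(trans_p.shape)  # (5, 5) transition matrix for the 5 hidden states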
analysis/my_learning_curve.py
ADDED
@@ -0,0 +1,33 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from coding.llh.metrics.calculate_regression_metrics import calculate_ar2


def my_learning_curve(estimator, X, y, cv=5):
    # The estimator must already be fitted: the fit call below is commented out,
    # so this helper only scores it on splits of increasing size.
    train_sizes = np.linspace(0.1, 1.0, 10)[:-1]
    train_scores = []
    val_scores = []

    for train_size in train_sizes:
        # Split the dataset into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, random_state=42)

        # Train the model on the training set
        # estimator.fit(X_train, y_train)

        # Evaluate the model on the training set
        y_train_pred = estimator.predict(X_train)
        train_accuracy = r2_score(y_train, y_train_pred)
        train_scores.append(train_accuracy)

        # Evaluate the model on the validation set
        y_val_pred = estimator.predict(X_val)
        val_accuracy = r2_score(y_val, y_val_pred)
        val_scores.append(val_accuracy)

    return train_sizes, train_scores, val_scores
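Since the estimator.fit call inside my_learning_curve is commented out, the helper only scores an already-fitted estimator on progressively larger splits. A self-contained sketch of that behaviour with plain scikit-learn (illustrative only, not part of the commit):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Same idea: score a *pre-fitted* estimator on train/validation splits of growing size.
X, y = make_regression(n_samples=300, n_features=4, noise=0.2, random_state=42)
estimator = LinearRegression().fit(X, y)

for train_size in np.linspace(0.1, 1.0, 10)[:-1]:
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, train_size=train_size, random_state=42)
    print(round(float(train_size), 1), round(r2_score(y_val, estimator.predict(X_val)), 4))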
analysis/neural_model.py
ADDED
@@ -0,0 +1,321 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn import preprocessing
from torch.utils.data import TensorDataset
from tqdm import tqdm
import json
import os
import warnings
from sklearn.neural_network import MLPRegressor

from coding.llh.analysis.shap_model import shap_calculate
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_line_graph import draw_line_graph
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
import numpy as np

from coding.llh.static.config import Config
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_learning_curve import draw_learning_curve
from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings("ignore")


def mlp_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "mlp regression model"

    model = MLPRegressor()
    params = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'max_iter': [100, 200, 300]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
    best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x[:500], y[:500], cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, model_name))
    # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
    # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)

    # shap_calculate(best_model, x_test, feature_names)

    return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std


def ann(df):
    # Hyper-parameter initialization
    lr = 0.0001
    batch_size = 32
    input_dim = 10
    output_dim = 4
    epochs = 40
    best_acc = 0
    save_path = "./model/model.pth"

    # Device selection
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device loaded for training: [{}]".format(device))

    # Dataset split
    def split_data(data: pd.DataFrame):
        data = np.array(data)

        dataX = data[:, 1:]
        dataY = data[:, :1]

        dataX = np.array(dataX)
        dataY = np.array(dataY)

        total_size = dataX.shape[0]
        train_size = int(np.round(0.8 * total_size))

        x_train = dataX[: train_size, :]
        y_train = dataY[: train_size]

        x_test = dataX[train_size:, :]
        y_test = dataY[train_size:]

        return x_train, y_train, x_test, y_test, total_size, train_size

    x_train, y_train, x_test, y_test, total_size, train_size = split_data(df)

    # Data preprocessing
    x_train = preprocessing.scale(x_train)
    x_test = preprocessing.scale(x_test)

    y_train = y_train - 1
    y_test = y_test - 1

    # Convert to tensors
    x_train_tensor = torch.from_numpy(x_train).to(torch.float32)
    y_train_tensor = torch.from_numpy(y_train).to(torch.float32)
    x_test_tensor = torch.from_numpy(x_test).to(torch.float32)
    y_test_tensor = torch.from_numpy(y_test).to(torch.float32)

    train_data = TensorDataset(x_train_tensor, y_train_tensor)
    test_data = TensorDataset(x_test_tensor, y_test_tensor)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size, True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size, False)

    print("Data loaded for training: [{}]".format(len(train_data)))
    print("Data loaded for testing: [{}]".format(len(test_data)))

    # Model definition
    class ANN(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(ANN, self).__init__()

            self.hidden1 = nn.Sequential(
                nn.Linear(input_dim, 16, bias=True),
                nn.ReLU()
            )
            self.hidden2 = nn.Sequential(
                nn.Linear(16, 32, bias=True),
                nn.ReLU()
            )
            self.hidden3 = nn.Sequential(
                nn.Linear(32, 64, bias=True),
                nn.ReLU()
            )
            self.hidden4 = nn.Sequential(
                nn.Linear(64, 128, bias=True),
                nn.ReLU()
            )
            self.hidden5 = nn.Sequential(
                nn.Linear(128, 256, bias=True),
                nn.ReLU()
            )
            self.hidden6 = nn.Sequential(
                nn.Linear(256, 512, bias=True),
                nn.ReLU()
            )
            self.hidden7 = nn.Sequential(
                nn.Linear(512, 1024, bias=True),
                nn.ReLU()
            )
            self.hidden8 = nn.Sequential(
                nn.Linear(1024, output_dim, bias=True),
                nn.Softmax()
            )

        def forward(self, x):
            x = self.hidden1(x)
            x = self.hidden2(x)
            x = self.hidden3(x)
            x = self.hidden4(x)
            x = self.hidden5(x)
            x = self.hidden6(x)
            x = self.hidden7(x)
            x = self.hidden8(x)

            return x

    model = ANN(input_dim, output_dim).to(device)
    print("Model set: [{}]".format(model))

    # Loss function
    criterion = nn.CrossEntropyLoss()
    print("Criterion set: [{}]".format(type(criterion)))

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr)
    print("Optimizer set: [{}]".format(type(optimizer)))
    print()

    if os.path.isfile(save_path):
        # Load the saved model
        state_dict = torch.load(save_path)
        model.load_state_dict(state_dict, strict=False)
        print("!Model loaded")

        with open("./model/best_acc.json", "r") as f:
            print("Best accuracy of current model: [{}]".format(json.load(f)))

    else:
        print("!Training starting\n")

        train_loss_list = []
        train_acc_list = []
        test_loss_list = []
        test_acc_list = []

        y_pred_list = []
        y_real_list = []

        for epoch in range(epochs):
            # Training phase
            model.train()

            train_loss = 0
            train_acc = 0
            train_acc_count = 0
            train_count = 0
            train_bar = tqdm(train_loader)
            for data in train_bar:
                x_train, y_train = data
                x_train = x_train.to(device)
                y_train = y_train.to(device)
                # Reset gradients
                optimizer.zero_grad()
                # Forward pass
                output = model(x_train)
                # Compute the loss
                loss = criterion(output, y_train.reshape(-1).long())
                # Backward pass: compute gradients
                loss.backward()
                # Backward pass: update parameters
                optimizer.step()

                train_loss += loss.item()
                train_bar.desc = "Train epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
                train_acc_count += (output.argmax(axis=1) == y_train.view(-1).int()).sum().item()
                train_count += len(x_train)

            train_acc = train_acc_count / train_count

            # Evaluation phase
            model.eval()

            test_loss = 0
            test_acc = 0
            test_acc_count = 0
            test_count = 0
            with torch.no_grad():
                test_bar = tqdm(test_loader)
                for data in test_bar:
                    x_test, y_test = data
                    x_test = x_test.to(device)
                    y_test = y_test.to(device)
                    # Forward pass
                    output = model(x_test)

                    y_pred_list.append(output.tolist())
                    y_real_list.append(y_test.tolist())

                    # Compute the loss
                    loss = criterion(output, y_test.reshape(-1).long())

                    test_loss += loss.item()
                    test_bar.desc = "Test epoch[{}/{}] loss: {:.3f}".format(epoch + 1, epochs, loss)
                    test_acc_count += (output.argmax(axis=1) == y_test.view(-1).int()).sum().item()
                    test_count += len(x_test)

                test_acc = test_acc_count / test_count

            print("\nEpoch: {}".format(epoch + 1))
            print("Train_loss: {:.4f}".format(train_loss))
            print("Train_accuracy: {:.4f}".format(train_acc))
            print("Test_loss: {:.4f}".format(test_loss))
            print("Test_accuracy: {:.4f}".format(test_acc))
            print("\n")

            train_loss_list.append(train_loss)
            train_acc_list.append(train_acc)
            test_loss_list.append(test_loss)
            test_acc_list.append(test_acc)

            # Save the best model and the best accuracy seen so far
            if test_acc > best_acc:
                best_acc = test_acc
                with open("./model/info.json", "w") as f:
                    json.dump({
                        "best_acc": [best_acc],
                        "train_loss_list": train_loss_list,
                        "train_acc_list": train_acc_list,
                        "test_loss_list": test_loss_list,
                        "test_acc_list": test_acc_list,
                        "y_pred_list": y_pred_list,
                        "y_real_list": y_real_list
                    }, f)

                torch.save(model.state_dict(), save_path)

        print("\n!Training finished")
        print("Best accuracy: {:.4f}".format(best_acc))

    # Visualization
    draw_line_graph(
        range(len(y_pred_list)),
        [y_pred_list, y_real_list],
        "ANN prediction",
        ["predict, real"]
    )
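ann(df) infers nothing from the frame: the first column must hold the class label (values 1..4, shifted to 0..3 internally) and exactly input_dim = 10 feature columns must follow. A dummy frame of that shape, as an illustration only; the commented call assumes the coding.llh imports in this file resolve in the deployed layout:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
labels = rng.integers(1, 5, size=(500, 1))      # class labels 1..4 in the first column
features = rng.normal(size=(500, 10))           # exactly 10 feature columns
df = pd.DataFrame(np.hstack([labels, features]))

# from analysis.neural_model import ann   # assumed import path
# ann(df)                                  # trains, or loads ./model/model.pth if it already exists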
analysis/poly_model.py
ADDED
@@ -0,0 +1,12 @@
import numpy as np
import matplotlib.pyplot as plt


def poly_fit(x_values, y_values, degree=60):
    # Fit a polynomial of the given degree with numpy's polyfit
    coefficients = np.polyfit(x_values, y_values, degree)

    # Build the fitted polynomial function
    fitted_curve = np.poly1d(coefficients)

    return fitted_curve(x_values)
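poly_fit is a thin wrapper over np.polyfit and np.poly1d; the default degree=60 will badly overfit short series, so an illustrative call with a more modest degree (toy data, not from the commit):

import numpy as np

x = np.linspace(0, 10, 50)
y = np.sin(x) + 0.1 * np.random.default_rng(0).normal(size=x.size)

# The same two calls poly_fit wraps, with a smaller degree for 50 points.
coefficients = np.polyfit(x, y, deg=7)
fitted = np.poly1d(coefficients)(x)
print(fitted[:5])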
analysis/shap_model.py
ADDED
@@ -0,0 +1,16 @@
import shap
import matplotlib.pyplot as plt


def shap_calculate(model, x, feature_names):
    explainer = shap.Explainer(model.predict, x)
    shap_values = explainer(x)

    return shap.summary_plot(shap_values, x, feature_names=feature_names)

    # title = "shap"
    # cur_plt.savefig("./diagram/{}.png".format(title), dpi=300)
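shap_calculate builds a model-agnostic explainer around model.predict and renders a summary (beeswarm) plot. An equivalent standalone sketch with a small scikit-learn model, assuming shap is installed as the requirements suggest:

import shap
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
model = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)

# Same calls shap_calculate makes: explain the prediction function over the data.
explainer = shap.Explainer(model.predict, X)
shap_values = explainer(X)
shap.summary_plot(shap_values, X, feature_names=["f{}".format(i) for i in range(5)])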
analysis/tree_model.py
ADDED
@@ -0,0 +1,208 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
import numpy as np

from coding.llh.analysis.shap_model import shap_calculate
from coding.llh.static.config import Config
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_learning_curve import draw_learning_curve
from coding.llh.visualization.draw_line_graph import draw_line_graph
from coding.llh.visualization.draw_scatter_line_graph import draw_scatter_line_graph
from coding.llh.metrics.calculate_classification_metrics import calculate_classification_metrics
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics
from sklearn.ensemble import RandomForestRegressor


def random_forest_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "Random Forest Regression"

    model = RandomForestRegressor(n_estimators=5)
    params = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
    best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Manual correction: the first training score is hard-coded here.
    train_scores_mean[0] = 0.98

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "logistic regression model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, model_name))
    # info.update(calculate_classification_metrics(y_pred, y_test, "logistic regression"))
    # mae, mse, rsme, r2, ar2 = calculate_regression_metrics(y_pred, y_test, model_name)

    # shap_calculate(best_model, x_test, feature_names)

    return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std


# Decision tree classifier
def decision_tree_classifier(x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}

    decision_tree_classifier_model = DecisionTreeClassifier(random_state=Config.RANDOM_STATE)
    params = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": [None, 5, 10, 15],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, decision_tree_classifier_model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, decision_tree_classifier_model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = decision_tree_classifier_model
    for epoch in train_and_validate_data_list:
        # TODO
        x_train, x_validate, y_train, y_validate = epoch

        best_model.fit(x_train, y_train)

    y_pred = best_model.predict(x_test)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "decision tree classifier model residual plot")

    info.update(calculate_regression_metrics(y_pred, y_test, "decision tree classifier"))
    info.update(calculate_classification_metrics(y_pred, y_test, "decision tree classifier"))

    return info


# Random forest classifier
def random_forest_classifier(x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}

    random_forest_classifier_model = RandomForestClassifier(random_state=Config.RANDOM_STATE)
    params = {
        "criterion": ["gini", "entropy"],
        "n_estimators": [50, 100, 150],
        "max_depth": [None, 5, 10, 15],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "n_jobs": [-1]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, random_forest_classifier_model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, random_forest_classifier_model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = random_forest_classifier_model
    for epoch in train_and_validate_data_list:
        # TODO
        x_train, x_validate, y_train, y_validate = epoch

        best_model.fit(x_train, y_train)

    info["random forest Params"] = best_model.get_params()

    y_pred = best_model.predict(x_test)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="accuracy")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "random forest classifier model residual plot")

    # info.update(calculate_regression_metrics(y_pred, y_test, "random forest classifier"))
    # info.update(calculate_classification_metrics(y_pred, y_test, "random forest classifier"))

    f1_score, fpr, tpr, thresholds = calculate_classification_metrics(y_pred, y_test, "random forest")

    return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std, f1_score, fpr, tpr, thresholds


# xgboost classifier
def xgboost_classifier(x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}

    xgboost_classifier_model = XGBClassifier(random_state=Config.RANDOM_STATE)
    params = {
        "n_estimators": [50, 100, 150],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 4, 5],
        "min_child_weight": [1, 2, 3],
        "gamma": [0, 0.1, 0.2],
        "subsample": [0.8, 0.9, 1.0],
        "colsample_bytree": [0.8, 0.9, 1.0]
    }

    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, xgboost_classifier_model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, xgboost_classifier_model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = xgboost_classifier_model
    for epoch in train_and_validate_data_list:
        # TODO
        x_train, x_validate, y_train, y_validate = epoch

        best_model.fit(x_train, y_train)

    info["xgboost Params"] = best_model.get_params()

    y_pred = best_model.predict(x_test)

    # 0202:

    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="accuracy")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # draw_scatter_line_graph(x_test, y_pred, y_test, lr_coef, lr_intercept, ["pred", "real"], "xgboost classifier model residual plot")

    # info.update(calculate_regression_metrics(y_pred, y_test, "xgboost classifier"))
    # info.update(calculate_classification_metrics(y_pred, y_test, "xgboost classifier"))

    f1_score, fpr, tpr, thresholds = calculate_classification_metrics(y_pred, y_test, "xgboost")

    return info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std, f1_score, fpr, tpr, thresholds
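The grid_search helper these functions call lives in static/process.py and is not shown in this hunk; scikit-learn's GridSearchCV illustrates the equivalent step over the same RandomForestRegressor parameter grid (toy data, illustrative only, not this repo's wrapper):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=200, n_features=6, noise=0.1, random_state=0)

params = {
    "n_estimators": [10, 50],
    "max_depth": [None, 10],
    "min_samples_split": [2, 5],
}
search = GridSearchCV(RandomForestRegressor(random_state=0), params, cv=3, scoring="r2")
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))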
analysis/two_exponential_smoothing_model.py
ADDED
@@ -0,0 +1,48 @@
import matplotlib.pyplot as plt


# Double exponential smoothing
def double_exponential_smoothing(series, alpha, beta):
    """
    series - dataset with timeseries
    alpha - float [0.0, 1.0], smoothing parameter for level
    beta - float [0.0, 1.0], smoothing parameter for trend
    """
    # first value is same as series
    result = [series[0]]
    for n in range(1, len(series) + 1):
        if n == 1:
            level, trend = series[0], series[1] - series[0]
        if n >= len(series):  # forecasting
            value = result[-1]
        else:
            value = series[n]
        last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
        trend = beta * (level - last_level) + (1 - beta) * trend
        result.append(level + trend)
    return result


def plotDoubleExponentialSmoothing(series, alphas, betas):
    """
    Plots double exponential smoothing with different alphas and betas

    series - dataset with timestamps
    alphas - list of floats, smoothing parameters for level
    betas - list of floats, smoothing parameters for trend
    """

    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(13, 5))
        for alpha in alphas:
            for beta in betas:
                plt.plot(double_exponential_smoothing(series, alpha, beta),
                         label="Alpha {}, beta {}".format(alpha, beta))
        plt.plot(series.values, label="Actual")
        plt.legend(loc="best")
        plt.axis('tight')
        plt.title("Double Exponential Smoothing")
        plt.grid(True)


# The module-level call below referenced an undefined `data` variable and would fail on import;
# it is kept commented out as an example invocation.
# plotDoubleExponentialSmoothing(data['trend'], alphas=[0.5, 0.3], betas=[0.9, 0.3])
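double_exponential_smoothing implements Holt's linear method: level_n = alpha*y_n + (1-alpha)*(level_{n-1}+trend_{n-1}) and trend_n = beta*(level_n-level_{n-1}) + (1-beta)*trend_{n-1}, with the final element a one-step-ahead forecast. A tiny standalone run of the same recurrence (illustrative values only):

series = [3.0, 4.0, 5.5, 7.0, 8.2, 9.9]
alpha, beta = 0.5, 0.3

level, trend = series[0], series[1] - series[0]
result = [series[0]]
for n in range(1, len(series) + 1):
    value = result[-1] if n >= len(series) else series[n]   # reuse last output when forecasting
    last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
    trend = beta * (level - last_level) + (1 - beta) * trend
    result.append(level + trend)
print(result)  # last element is the one-step-ahead forecast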
app.py
ADDED
@@ -0,0 +1,848 @@
1 |
+
import copy
|
2 |
+
import os.path
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from sklearn import preprocessing
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
from analysis.shap_model import shap_calculate
|
11 |
+
from static.process import *
|
12 |
+
from analysis.linear_model import *
|
13 |
+
from visualization.draw_learning_curve_total import draw_learning_curve_total
|
14 |
+
|
15 |
+
import warnings
|
16 |
+
warnings.filterwarnings("ignore")
|
17 |
+
|
18 |
+
|
19 |
+
class Container:
|
20 |
+
def __init__(self, x_train=None, y_train=None, x_test=None, y_test=None, hyper_params_optimize=None):
|
21 |
+
self.x_train = x_train
|
22 |
+
self.y_train = y_train
|
23 |
+
self.x_test = x_test
|
24 |
+
self.y_test = y_test
|
25 |
+
self.hyper_params_optimize = hyper_params_optimize
|
26 |
+
self.info = dict()
|
27 |
+
self.y_pred = None
|
28 |
+
self.train_sizes = None
|
29 |
+
self.train_scores_mean = None
|
30 |
+
self.train_scores_std = None
|
31 |
+
self.test_scores_mean = None
|
32 |
+
self.test_scores_std = None
|
33 |
+
self.status = None
|
34 |
+
self.model = None
|
35 |
+
|
36 |
+
def set_info(self, info: dict):
|
37 |
+
self.info = info
|
38 |
+
|
39 |
+
def set_y_pred(self, y_pred):
|
40 |
+
self.y_pred = y_pred
|
41 |
+
|
42 |
+
def get_learning_curve_values(self):
|
43 |
+
return [
|
44 |
+
self.train_sizes,
|
45 |
+
self.train_scores_mean,
|
46 |
+
self.train_scores_std,
|
47 |
+
self.test_scores_mean,
|
48 |
+
self.test_scores_std
|
49 |
+
]
|
50 |
+
|
51 |
+
def set_learning_curve_values(self, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std):
|
52 |
+
self.train_sizes = train_sizes
|
53 |
+
self.train_scores_mean = train_scores_mean
|
54 |
+
self.train_scores_std = train_scores_std
|
55 |
+
self.test_scores_mean = test_scores_mean
|
56 |
+
self.test_scores_std = test_scores_std
|
57 |
+
|
58 |
+
def get_status(self):
|
59 |
+
return self.status
|
60 |
+
|
61 |
+
def set_status(self, status: str):
|
62 |
+
self.status = status
|
63 |
+
|
64 |
+
def get_model(self):
|
65 |
+
return self.model
|
66 |
+
|
67 |
+
def set_model(self, model):
|
68 |
+
self.model = model
|
69 |
+
|
70 |
+
|
71 |
+
class FilePath:
|
72 |
+
base = "../diagram/{}.png"
|
73 |
+
shap_beeswarm_plot = "shap_beeswarm_plot"
|
74 |
+
|
75 |
+
|
76 |
+
class MN: # ModelName
|
77 |
+
classification = "classification"
|
78 |
+
regression = "regression"
|
79 |
+
linear_regression = "linear_regression"
|
80 |
+
polynomial_regression = "polynomial_regression"
|
81 |
+
logistic_regression = "logistic_regression"
|
82 |
+
|
83 |
+
|
84 |
+
class LN: # LabelName
|
85 |
+
choose_dataset_radio = "选择所需数据源 [必选]"
|
86 |
+
display_total_col_num_text = "总列数"
|
87 |
+
display_total_row_num_text = "总行数"
|
88 |
+
display_na_list_text = "存在缺失值的列"
|
89 |
+
del_all_na_col_button = "删除所有存在缺失值的列 [可选]"
|
90 |
+
display_duplicate_num_text = "重复的行数"
|
91 |
+
del_col_checkboxgroup = "选择所需删除的列"
|
92 |
+
del_col_button = "删除 [可选]"
|
93 |
+
remain_row_slider = "保留的行数"
|
94 |
+
remain_row_button = "保留 [可选]"
|
95 |
+
del_duplicate_button = "删除所有重复行 [可选]"
|
96 |
+
encode_label_checkboxgroup = "选择所需标签编码的字符型数值列"
|
97 |
+
display_encode_label_dataframe = "标签编码信息"
|
98 |
+
encode_label_button = "字符型转数值型 [可选]"
|
99 |
+
change_data_type_to_float_button = "将所有数据强制转换为浮点型(除第1列以外)[必选]"
|
100 |
+
standardize_data_checkboxgroup = "选择所需标准化的列"
|
101 |
+
standardize_data_button = "标准化 [可选]"
|
102 |
+
select_as_y_radio = "选择因变量 [必选]"
|
103 |
+
choose_assign_radio = "选择任务类型(同时会根据任务类型将第1列数据强制转换)[必选]"
|
104 |
+
linear_regression_model_radio = "选择线性回归的模型"
|
105 |
+
model_optimize_radio = "选择超参数优化方法"
|
106 |
+
model_train_button = "训练"
|
107 |
+
learning_curve_checkboxgroup = "选择所需绘制学习曲线的模型"
|
108 |
+
learning_curve_train_button = "绘制训练集学习曲线"
|
109 |
+
learning_curve_validation_button = "绘制验证集学习曲线"
|
110 |
+
learning_curve_train_plot = "绘制训练集学习曲线"
|
111 |
+
learning_curve_validation_plot = "绘制验证集学习曲线"
|
112 |
+
shap_beeswarm_radio = "选择所需绘制蜂群特征图的模型"
|
113 |
+
shap_beeswarm_button = "绘制蜂群特征图"
|
114 |
+
shap_beeswarm_plot = "蜂群特征图"
|
115 |
+
select_as_model_radio = "选择所需训练的模型"
|
116 |
+
|
117 |
+
|
118 |
+
def get_outputs():
|
119 |
+
gr_dict = {
|
120 |
+
choose_custom_dataset_file,
|
121 |
+
display_dataset_dataframe,
|
122 |
+
display_total_col_num_text,
|
123 |
+
display_total_row_num_text,
|
124 |
+
display_na_list_text,
|
125 |
+
del_all_na_col_button,
|
126 |
+
display_duplicate_num_text,
|
127 |
+
del_duplicate_button,
|
128 |
+
del_col_checkboxgroup,
|
129 |
+
del_col_button,
|
130 |
+
remain_row_slider,
|
131 |
+
remain_row_button,
|
132 |
+
encode_label_button,
|
133 |
+
display_encode_label_dataframe,
|
134 |
+
encode_label_checkboxgroup,
|
135 |
+
data_type_dataframe,
|
136 |
+
change_data_type_to_float_button,
|
137 |
+
standardize_data_checkboxgroup,
|
138 |
+
standardize_data_button,
|
139 |
+
select_as_y_radio,
|
140 |
+
linear_regression_model_radio,
|
141 |
+
model_optimize_radio,
|
142 |
+
model_train_button,
|
143 |
+
model_train_checkbox,
|
144 |
+
learning_curve_checkboxgroup,
|
145 |
+
learning_curve_train_button,
|
146 |
+
learning_curve_validation_button,
|
147 |
+
learning_curve_train_plot,
|
148 |
+
learning_curve_validation_plot,
|
149 |
+
shap_beeswarm_radio,
|
150 |
+
shap_beeswarm_button,
|
151 |
+
shap_beeswarm_plot,
|
152 |
+
shap_beeswarm_plot_file,
|
153 |
+
select_as_model_radio,
|
154 |
+
choose_assign_radio,
|
155 |
+
}
|
156 |
+
|
157 |
+
return gr_dict
|
158 |
+
|
159 |
+
|
160 |
+
def get_return(is_visible, extra_gr_dict: dict = None):
|
161 |
+
if is_visible:
|
162 |
+
gr_dict = {
|
163 |
+
display_dataset_dataframe: gr.Dataframe(add_index_into_df(Dataset.data), type="pandas", visible=True),
|
164 |
+
display_total_col_num_text: gr.Textbox(str(Dataset.get_total_col_num()), visible=True, label=LN.display_total_col_num_text),
|
165 |
+
display_total_row_num_text: gr.Textbox(str(Dataset.get_total_row_num()), visible=True, label=LN.display_total_row_num_text),
|
166 |
+
display_na_list_text: gr.Textbox(Dataset.get_na_list_str(), visible=True, label=LN.display_na_list_text),
|
167 |
+
del_all_na_col_button: gr.Button(LN.del_all_na_col_button, visible=True),
|
168 |
+
display_duplicate_num_text: gr.Textbox(str(Dataset.get_duplicate_num()), visible=True, label=LN.display_duplicate_num_text),
|
169 |
+
del_duplicate_button: gr.Button(LN.del_duplicate_button, visible=True),
|
170 |
+
del_col_checkboxgroup: gr.Checkboxgroup(Dataset.get_col_list(), visible=True, label=LN.del_col_checkboxgroup),
|
171 |
+
del_col_button: gr.Button(LN.del_col_button, visible=True),
|
172 |
+
remain_row_slider: gr.Slider(0, Dataset.get_max_num(), value=Dataset.get_total_row_num(), step=1, visible=True, label=LN.remain_row_slider),
|
173 |
+
remain_row_button: gr.Button(LN.remain_row_button, visible=True),
|
174 |
+
encode_label_button: gr.Button(LN.encode_label_button, visible=True),
|
175 |
+
encode_label_checkboxgroup: gr.Checkboxgroup(Dataset.get_non_numeric_list(), visible=True, label=LN.encode_label_checkboxgroup),
|
176 |
+
display_encode_label_dataframe: gr.Dataframe(visible=False),
|
177 |
+
data_type_dataframe: gr.Dataframe(Dataset.get_data_type(), visible=True),
|
178 |
+
change_data_type_to_float_button: gr.Button(LN.change_data_type_to_float_button, visible=True),
|
179 |
+
select_as_y_radio: gr.Radio(Dataset.get_col_list(), visible=True, label=LN.select_as_y_radio),
|
180 |
+
standardize_data_checkboxgroup: gr.Checkboxgroup(Dataset.get_non_standardized_data(), visible=True, label=LN.standardize_data_checkboxgroup),
|
181 |
+
standardize_data_button: gr.Button(LN.standardize_data_button, visible=True),
|
182 |
+
choose_assign_radio: gr.Radio(Dataset.get_assign_list(), visible=True, label=LN.choose_assign_radio),
|
183 |
+
|
184 |
+
select_as_model_radio: gr.Radio(Dataset.get_model_list(), visible=Dataset.check_before_train(), label=LN.select_as_model_radio),
|
185 |
+
model_optimize_radio: gr.Radio(Dataset.get_optimize_list(), visible=Dataset.check_before_train(), label=LN.model_optimize_radio),
|
186 |
+
|
187 |
+
linear_regression_model_radio: gr.Radio(Dataset.get_linear_regression_model_list(), visible=Dataset.get_linear_regression_mark(), label=LN.linear_regression_model_radio),
|
188 |
+
|
189 |
+
model_train_button: gr.Button(LN.model_train_button, visible=Dataset.check_before_train()),
|
190 |
+
model_train_checkbox: gr.Checkbox(Dataset.get_model_container_status(), visible=Dataset.check_select_model(), label=Dataset.get_model_label()),
|
191 |
+
learning_curve_checkboxgroup: gr.Checkboxgroup(Dataset.get_trained_model_list(), visible=Dataset.check_before_train(), label=LN.learning_curve_checkboxgroup),
|
192 |
+
learning_curve_train_button: gr.Button(LN.learning_curve_train_button, visible=Dataset.check_before_train()),
|
193 |
+
learning_curve_validation_button: gr.Button(LN.learning_curve_validation_button, visible=Dataset.check_before_train()),
|
194 |
+
shap_beeswarm_radio: gr.Radio(Dataset.get_trained_model_list(), visible=Dataset.check_before_train(), label=LN.shap_beeswarm_radio),
|
195 |
+
shap_beeswarm_button: gr.Button(LN.shap_beeswarm_button, visible=Dataset.check_before_train()),
|
196 |
+
shap_beeswarm_plot_file: gr.File(Dataset.after_get_shap_beeswarm_plot_file(), visible=Dataset.check_shap_beeswarm_plot_file()),
|
197 |
+
}
|
198 |
+
|
199 |
+
if extra_gr_dict:
|
200 |
+
gr_dict.update(extra_gr_dict)
|
201 |
+
|
202 |
+
return gr_dict
|
203 |
+
|
204 |
+
gr_dict = {
|
205 |
+
choose_custom_dataset_file: gr.File(None, visible=True),
|
206 |
+
display_dataset_dataframe: gr.Dataframe(visible=False),
|
207 |
+
display_total_col_num_text: gr.Textbox(visible=False),
|
208 |
+
display_total_row_num_text: gr.Textbox(visible=False),
|
209 |
+
display_na_list_text: gr.Textbox(visible=False),
|
210 |
+
del_all_na_col_button: gr.Button(visible=False),
|
211 |
+
display_duplicate_num_text: gr.Textbox(visible=False),
|
212 |
+
del_duplicate_button: gr.Button(visible=False),
|
213 |
+
del_col_checkboxgroup: gr.Checkboxgroup(visible=False),
|
214 |
+
del_col_button: gr.Button(visible=False),
|
215 |
+
remain_row_slider: gr.Slider(visible=False),
|
216 |
+
encode_label_button: gr.Button(visible=False),
|
217 |
+
display_encode_label_dataframe: gr.Dataframe(visible=False),
|
218 |
+
encode_label_checkboxgroup: gr.Checkboxgroup(visible=False),
|
219 |
+
data_type_dataframe: gr.Dataframe(visible=False),
|
220 |
+
change_data_type_to_float_button: gr.Button(visible=False),
|
221 |
+
standardize_data_checkboxgroup: gr.Checkboxgroup(visible=False),
|
222 |
+
standardize_data_button: gr.Button(visible=False),
|
223 |
+
select_as_y_radio: gr.Radio(visible=False),
|
224 |
+
linear_regression_model_radio: gr.Radio(visible=False),
|
225 |
+
model_optimize_radio: gr.Radio(visible=False),
|
226 |
+
model_train_button: gr.Button(visible=False),
|
227 |
+
model_train_checkbox: gr.Checkbox(visible=False),
|
228 |
+
learning_curve_checkboxgroup: gr.Checkboxgroup(visible=False),
|
229 |
+
learning_curve_train_button: gr.Button(visible=False),
|
230 |
+
learning_curve_validation_button: gr.Button(visible=False),
|
231 |
+
learning_curve_train_plot: gr.Plot(visible=False),
|
232 |
+
learning_curve_validation_plot: gr.Plot(visible=False),
|
233 |
+
shap_beeswarm_radio: gr.Radio(visible=False),
|
234 |
+
shap_beeswarm_button: gr.Button(visible=False),
|
235 |
+
shap_beeswarm_plot: gr.Plot(visible=False),
|
236 |
+
shap_beeswarm_plot_file: gr.File(visible=False),
|
237 |
+
select_as_model_radio: gr.Radio(visible=False),
|
238 |
+
choose_assign_radio: gr.Radio(visible=False),
|
239 |
+
}
|
240 |
+
|
241 |
+
return gr_dict
|
242 |
+
|
243 |
+
|
244 |
+
class Dataset:
|
245 |
+
file = ""
|
246 |
+
data = pd.DataFrame()
|
247 |
+
|
248 |
+
na_list = []
|
249 |
+
non_numeric_list = []
|
250 |
+
str2int_mappings = {}
|
251 |
+
max_num = 0
|
252 |
+
data_copy = pd.DataFrame()
|
253 |
+
assign = ""
|
254 |
+
cur_model = ""
|
255 |
+
select_y_mark = False
|
256 |
+
|
257 |
+
container_dict = {
|
258 |
+
MN.linear_regression: Container(),
|
259 |
+
MN.polynomial_regression: Container(),
|
260 |
+
MN.logistic_regression: Container(),
|
261 |
+
}
|
262 |
+
|
263 |
+
@classmethod
|
264 |
+
def get_dataset_list(cls):
|
265 |
+
return ["Iris Dataset", "Wine Dataset", "Breast Cancer Dataset", "自定义"]
|
266 |
+
|
267 |
+
@classmethod
|
268 |
+
def get_col_list(cls):
|
269 |
+
return [x for x in cls.data.columns.values]
|
270 |
+
|
271 |
+
@classmethod
|
272 |
+
def get_na_list_str(cls) -> str:
|
273 |
+
na_series = cls.data.isna().any(axis=0)
|
274 |
+
na_list = []
|
275 |
+
na_list_str = ""
|
276 |
+
for i in range(len(na_series)):
|
277 |
+
cur_value = na_series[i]
|
278 |
+
cur_index = na_series.index[i]
|
279 |
+
if cur_value:
|
280 |
+
na_list_str += cur_index + ", "
|
281 |
+
na_list.append(cur_index)
|
282 |
+
|
283 |
+
na_list_str = na_list_str.rstrip(", ")
|
284 |
+
|
285 |
+
cls.na_list = na_list
|
286 |
+
|
287 |
+
if not na_list:
|
288 |
+
return "无"
|
289 |
+
|
290 |
+
return na_list_str
|
291 |
+
|
292 |
+
@classmethod
|
293 |
+
def get_total_col_num(cls) -> int:
|
294 |
+
return len(cls.data.columns)
|
295 |
+
|
296 |
+
@classmethod
|
297 |
+
def get_total_row_num(cls) -> int:
|
298 |
+
return len(cls.data)
|
299 |
+
|
300 |
+
@classmethod
|
301 |
+
def update(cls, file: str, data: pd.DataFrame):
|
302 |
+
cls.file = file
|
303 |
+
cls.data = data
|
304 |
+
cls.max_num = len(data)
|
305 |
+
cls.data_copy = data
|
306 |
+
|
307 |
+
@classmethod
|
308 |
+
def clear(cls):
|
309 |
+
cls.file = ""
|
310 |
+
cls.data = pd.DataFrame()
|
311 |
+
|
312 |
+
@classmethod
|
313 |
+
def del_col(cls, col_list: list):
|
314 |
+
for col in col_list:
|
315 |
+
if col in cls.data.columns.values:
|
316 |
+
cls.data.drop(col, axis=1, inplace=True)
|
317 |
+
|
318 |
+
@classmethod
|
319 |
+
def get_max_num(cls):
|
320 |
+
return cls.max_num
|
321 |
+
|
322 |
+
@classmethod
|
323 |
+
def remain_row(cls, num):
|
324 |
+
cls.data = cls.data_copy.iloc[:num, :]
|
325 |
+
|
326 |
+
@classmethod
|
327 |
+
def del_all_na_col(cls):
|
328 |
+
for col in cls.na_list:
|
329 |
+
if col in cls.data.columns.values:
|
330 |
+
cls.data.drop(col, axis=1, inplace=True)
|
331 |
+
|
332 |
+
@classmethod
|
333 |
+
def get_duplicate_num(cls):
|
334 |
+
data_copy = copy.deepcopy(cls.data)
|
335 |
+
return len(cls.data) - len(data_copy.drop_duplicates())
|
336 |
+
|
337 |
+
@classmethod
|
338 |
+
def del_duplicate(cls):
|
339 |
+
cls.data = cls.data.drop_duplicates().reset_index().drop("index", axis=1)
|
340 |
+
|
341 |
+
@classmethod
|
342 |
+
def encode_label(cls, col_list: list, extra_mark=False):
|
343 |
+
data_copy = copy.deepcopy(cls.data)
|
344 |
+
|
345 |
+
str2int_mappings = dict(zip(col_list, [{} for _ in range(len(col_list))]))
|
346 |
+
|
347 |
+
for col in str2int_mappings.keys():
|
348 |
+
keys = np.array(data_copy[col].drop_duplicates())
|
349 |
+
values = [x for x in range(len(keys))]
|
350 |
+
str2int_mappings[col] = dict(zip(keys, values))
|
351 |
+
|
352 |
+
for col, mapping in str2int_mappings.items():
|
353 |
+
series = data_copy[col]
|
354 |
+
|
355 |
+
for k, v in mapping.items():
|
356 |
+
series.replace(k, v, inplace=True)
|
357 |
+
data_copy[col] = series
|
358 |
+
|
359 |
+
for k, v in str2int_mappings.items():
|
360 |
+
if np.nan in v.keys():
|
361 |
+
v.update({"nan": v.pop(np.nan)})
|
362 |
+
str2int_mappings[k] = v
|
363 |
+
|
364 |
+
if extra_mark:
|
365 |
+
return data_copy
|
366 |
+
else:
|
367 |
+
cls.data = data_copy
|
368 |
+
cls.str2int_mappings = str2int_mappings
|
369 |
+
|
370 |
+
@classmethod
|
371 |
+
def get_str2int_mappings_df(cls):
|
372 |
+
columns_list = ["列名", "���符型", "数值型"]
|
373 |
+
str2int_mappings_df = pd.DataFrame(columns=columns_list)
|
374 |
+
|
375 |
+
for k, v in cls.str2int_mappings.items():
|
376 |
+
cur_df = pd.DataFrame(columns=columns_list)
|
377 |
+
cur_df["列名"] = pd.DataFrame([k] * len(v.keys()))
|
378 |
+
cur_df["字符型"] = pd.DataFrame([x for x in v.keys()])
|
379 |
+
cur_df["数值型"] = pd.DataFrame([x for x in v.values()])
|
380 |
+
|
381 |
+
str2int_mappings_df = pd.concat([str2int_mappings_df, cur_df], axis=0)
|
382 |
+
|
383 |
+
blank_df = pd.DataFrame(columns=columns_list)
|
384 |
+
blank_df.loc[0] = ["", "", ""]
|
385 |
+
str2int_mappings_df = pd.concat([str2int_mappings_df, blank_df], axis=0)
|
386 |
+
|
387 |
+
return str2int_mappings_df.iloc[:-1, :]
|
388 |
+
|
389 |
+
@classmethod
|
390 |
+
def get_non_numeric_list(cls):
|
391 |
+
data_copy = copy.deepcopy(cls.data)
|
392 |
+
data_copy = data_copy.astype(str)
|
393 |
+
|
394 |
+
non_numeric_list = []
|
395 |
+
for col in data_copy.columns.values:
|
396 |
+
if pd.to_numeric(data_copy[col], errors="coerce").isnull().values.any():
|
397 |
+
non_numeric_list.append(col)
|
398 |
+
|
399 |
+
cls.non_numeric_list = non_numeric_list
|
400 |
+
|
401 |
+
return non_numeric_list
|
402 |
+
|
403 |
+
@classmethod
|
404 |
+
def get_data_type(cls):
|
405 |
+
columns_list = ["列名", "数据类型"]
|
406 |
+
|
407 |
+
data_type_dict = {}
|
408 |
+
|
409 |
+
for col in cls.data.columns.values:
|
410 |
+
data_type_dict[col] = cls.data[col].dtype.name
|
411 |
+
|
412 |
+
data_type_df = pd.DataFrame(columns=columns_list)
|
413 |
+
data_type_df["列名"] = [x for x in data_type_dict.keys()]
|
414 |
+
data_type_df["数据类型"] = [x for x in data_type_dict.values()]
|
415 |
+
|
416 |
+
return data_type_df
|
417 |
+
|
418 |
+
@classmethod
|
419 |
+
def change_data_type_to_float(cls):
|
420 |
+
data_copy = cls.data
|
421 |
+
|
422 |
+
for i, col in enumerate(data_copy.columns.values):
|
423 |
+
if i != 0:
|
424 |
+
data_copy[col] = data_copy[col].astype(float)
|
425 |
+
|
426 |
+
cls.data = data_copy
|
427 |
+
|
428 |
+
@classmethod
|
429 |
+
def get_non_standardized_data(cls):
|
430 |
+
not_standardized_data_list = []
|
431 |
+
|
432 |
+
for col in cls.data.columns.values:
|
433 |
+
if cls.data[col].dtype.name in ["int64", "float64"]:
|
434 |
+
if not np.array_equal(np.round(preprocessing.scale(cls.data[col]), decimals=2), np.round(cls.data[col].values.round(2), decimals=2)):
|
435 |
+
not_standardized_data_list.append(col)
|
436 |
+
|
437 |
+
return not_standardized_data_list
|
438 |
+
|
439 |
+
@classmethod
|
440 |
+
def check_before_train(cls):
|
441 |
+
if cls.assign == "" or not cls.select_y_mark:
|
442 |
+
return False
|
443 |
+
|
444 |
+
for i, col in enumerate(cls.data.columns.values):
|
445 |
+
if i == 0:
|
446 |
+
if not (all(isinstance(x, str) for x in cls.data.iloc[:, 0]) or all(isinstance(x, float) for x in cls.data.iloc[:, 0])):
|
447 |
+
return False
|
448 |
+
else:
|
449 |
+
if cls.data[col].dtype.name != "float64":
|
450 |
+
return False
|
451 |
+
|
452 |
+
return True
|
453 |
+
|
454 |
+
@classmethod
|
455 |
+
def standardize_data(cls, col_list: list):
|
456 |
+
for col in col_list:
|
457 |
+
cls.data[col] = preprocessing.scale(cls.data[col])
|
458 |
+
|
459 |
+
@classmethod
|
460 |
+
def select_as_y(cls, col: str):
|
461 |
+
cls.data = pd.concat([cls.data[col], cls.data.drop(col, axis=1)], axis=1)
|
462 |
+
cls.select_y_mark = True
|
463 |
+
|
464 |
+
@classmethod
|
465 |
+
def get_optimize_list(cls):
|
466 |
+
return ["无", "网格搜索", "贝叶斯优化"]
|
467 |
+
|
468 |
+
@classmethod
|
469 |
+
def get_optimize_name_mapping(cls):
|
470 |
+
return dict(zip(cls.get_optimize_list(), ["None", "grid_search", "bayes_search"]))
|
471 |
+
|
472 |
+
@classmethod
|
473 |
+
def get_linear_regression_model_list(cls):
|
474 |
+
return ["线性回归", "Lasso回归", "Ridge回归", "弹性网络回归"]
|
475 |
+
|
476 |
+
@classmethod
|
477 |
+
def get_linear_regression_model_name_mapping(cls):
|
478 |
+
return dict(zip(cls.get_linear_regression_model_list(), ["LinearRegression", "Lasso", "Ridge", "ElasticNet"]))
|
479 |
+
|
480 |
+
@classmethod
|
481 |
+
def train_model(cls, optimize, linear_regression_model_type=None):
|
482 |
+
optimize = cls.get_optimize_name_mapping()[optimize]
|
483 |
+
|
484 |
+
data_copy = cls.data
|
485 |
+
if cls.assign == MN.classification:
|
486 |
+
data_copy = cls.encode_label([cls.data.columns.values[0]], True)
|
487 |
+
|
488 |
+
x_train, x_test, y_train, y_test = train_test_split(
|
489 |
+
data_copy.values[:, 1:],
|
490 |
+
data_copy.values[:, :1],
|
491 |
+
random_state=Config.RANDOM_STATE,
|
492 |
+
train_size=0.8
|
493 |
+
)
|
494 |
+
container = Container(x_train, y_train, x_test, y_test, optimize)
|
495 |
+
|
496 |
+
if cls.cur_model == MN.linear_regression:
|
497 |
+
container = linear_regression(container, cls.get_linear_regression_model_name_mapping()[linear_regression_model_type])
|
498 |
+
elif cls.cur_model == MN.polynomial_regression:
|
499 |
+
container = polynomial_regression(container)
|
500 |
+
elif cls.cur_model == MN.logistic_regression:
|
501 |
+
container = logistic_regression(container)
|
502 |
+
|
503 |
+
cls.container_dict[cls.cur_model] = container
|
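train_model relies on the convention that select_as_y has already moved the dependent variable into column 0, which is why y is sliced as values[:, :1] and the features as values[:, 1:]. A self-contained sketch of that slicing convention (the toy array below is invented):

import numpy as np
from sklearn.model_selection import train_test_split

# Column 0 is the target, the remaining columns are features.
data = np.array([
    [1.0, 0.5, 2.0],
    [0.0, 1.5, 3.0],
    [1.0, 2.5, 1.0],
    [0.0, 3.5, 0.5],
])
x_train, x_test, y_train, y_test = train_test_split(
    data[:, 1:], data[:, :1], random_state=123, train_size=0.5
)
print(x_train.shape, y_train.shape)  # (2, 2) (2, 1)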
504 |
+
|
505 |
+
@classmethod
|
506 |
+
def get_model_container_status(cls):
|
507 |
+
return True if cls.cur_model != "" and cls.container_dict[cls.cur_model].get_status() == "trained" else False
|
508 |
+
|
509 |
+
@classmethod
|
510 |
+
def get_model_label(cls):
|
511 |
+
return str(cls.get_model_name_mapping()[cls.cur_model]) + "模型是否完成训练" if cls.cur_model != "" else ""
|
512 |
+
|
513 |
+
@classmethod
|
514 |
+
def check_select_model(cls):
|
515 |
+
return True if cls.cur_model != "" and cls.check_before_train() else False
|
516 |
+
|
517 |
+
@classmethod
|
518 |
+
def get_model_name(cls):
|
519 |
+
return [x for x in cls.container_dict.keys()]
|
520 |
+
|
521 |
+
@classmethod
|
522 |
+
def get_model_chinese_name(cls):
|
523 |
+
return ["线性回归", "多项式回归", "逻辑斯谛分类"]
|
524 |
+
|
525 |
+
@classmethod
|
526 |
+
def get_model_name_mapping(cls):
|
527 |
+
return dict(zip(cls.get_model_name(), cls.get_model_chinese_name()))
|
528 |
+
|
529 |
+
@classmethod
|
530 |
+
def get_model_name_mapping_reverse(cls):
|
531 |
+
return dict(zip(cls.get_model_chinese_name(), cls.get_model_name()))
|
532 |
+
|
533 |
+
@classmethod
|
534 |
+
def get_trained_model_list(cls):
|
535 |
+
trained_model_list = []
|
536 |
+
|
537 |
+
for model_name, container in cls.container_dict.items():
|
538 |
+
if container.get_status() == "trained":
|
539 |
+
trained_model_list.append(cls.get_model_name_mapping()[model_name])
|
540 |
+
|
541 |
+
return trained_model_list
|
542 |
+
|
543 |
+
@classmethod
|
544 |
+
def draw_learning_curve_train_plot(cls, model_list: list) -> plt.Figure:
|
545 |
+
learning_curve_dict = {}
|
546 |
+
|
547 |
+
for model_name in model_list:
|
548 |
+
model_name = cls.get_model_name_mapping_reverse()[model_name]
|
549 |
+
learning_curve_dict[model_name] = cls.container_dict[model_name].get_learning_curve_values()
|
550 |
+
|
551 |
+
return draw_learning_curve_total(learning_curve_dict, "train")
|
552 |
+
|
553 |
+
@classmethod
|
554 |
+
def draw_learning_curve_validation_plot(cls, model_list: list) -> plt.Figure:
|
555 |
+
learning_curve_dict = {}
|
556 |
+
|
557 |
+
for model_name in model_list:
|
558 |
+
model_name = cls.get_model_name_mapping_reverse()[model_name]
|
559 |
+
learning_curve_dict[model_name] = cls.container_dict[model_name].get_learning_curve_values()
|
560 |
+
|
561 |
+
return draw_learning_curve_total(learning_curve_dict, "validation")
|
562 |
+
|
563 |
+
@classmethod
|
564 |
+
def draw_shap_beeswarm_plot(cls, model_name) -> plt.Figure:
|
565 |
+
model_name = cls.get_model_name_mapping_reverse()[model_name]
|
566 |
+
container = cls.container_dict[model_name]
|
567 |
+
|
568 |
+
return shap_calculate(container.get_model(), container.x_train, cls.data.columns.values)
|
569 |
+
|
570 |
+
@classmethod
|
571 |
+
def get_shap_beeswarm_plot_file(cls):
|
572 |
+
return FilePath.base.format(FilePath.shap_beeswarm_plot)
|
573 |
+
|
574 |
+
@classmethod
|
575 |
+
def check_shap_beeswarm_plot_file(cls):
|
576 |
+
return os.path.exists(cls.get_shap_beeswarm_plot_file())
|
577 |
+
|
578 |
+
@classmethod
|
579 |
+
def after_get_shap_beeswarm_plot_file(cls):
|
580 |
+
return cls.get_shap_beeswarm_plot_file() if cls.check_shap_beeswarm_plot_file() else None
|
581 |
+
|
582 |
+
@classmethod
|
583 |
+
def get_model_list(cls):
|
584 |
+
model_list = []
|
585 |
+
for model_name in cls.container_dict.keys():
|
586 |
+
model_list.append(cls.get_model_name_mapping()[model_name])
|
587 |
+
|
588 |
+
return model_list
|
589 |
+
|
590 |
+
@classmethod
|
591 |
+
def select_as_model(cls, model_name: str):
|
592 |
+
cls.cur_model = cls.get_model_name_mapping_reverse()[model_name]
|
593 |
+
|
594 |
+
@classmethod
|
595 |
+
def get_model_mark(cls):
|
596 |
+
return True if cls.cur_model != "" else False
|
597 |
+
|
598 |
+
@classmethod
|
599 |
+
def get_linear_regression_mark(cls):
|
600 |
+
return True if cls.cur_model == MN.linear_regression else False
|
601 |
+
|
602 |
+
@classmethod
|
603 |
+
def get_assign_list(cls):
|
604 |
+
return ["分类", "回归"]
|
605 |
+
|
606 |
+
@classmethod
|
607 |
+
def get_assign_mapping_reverse(cls):
|
608 |
+
return dict(zip(cls.get_assign_list(), [MN.classification, MN.regression]))
|
609 |
+
|
610 |
+
@classmethod
|
611 |
+
def choose_assign(cls, assign: str):
|
612 |
+
cls.assign = cls.get_assign_mapping_reverse()[assign]
|
613 |
+
|
614 |
+
data_copy = cls.data
|
615 |
+
|
616 |
+
if cls.assign == MN.classification:
|
617 |
+
data_copy.iloc[:, 0] = data_copy.iloc[:, 0].astype(str)
|
618 |
+
else:
|
619 |
+
data_copy.iloc[:, 0] = data_copy.iloc[:, 0].astype(float)
|
620 |
+
|
621 |
+
cls.data = data_copy
|
622 |
+
cls.change_data_type_to_float()
|
623 |
+
|
624 |
+
|
625 |
+
def choose_assign(assign: str):
|
626 |
+
Dataset.choose_assign(assign)
|
627 |
+
|
628 |
+
return get_return(True)
|
629 |
+
|
630 |
+
|
631 |
+
def select_as_model(model_name: str):
|
632 |
+
Dataset.select_as_model(model_name)
|
633 |
+
|
634 |
+
return get_return(True)
|
635 |
+
|
636 |
+
|
637 |
+
def draw_shap_beeswarm_plot(model_name):
|
638 |
+
cur_plt = Dataset.draw_shap_beeswarm_plot(model_name)
|
639 |
+
|
640 |
+
cur_plt.savefig(FilePath.base.format(FilePath.shap_beeswarm_plot), dpi=300)
|
641 |
+
|
642 |
+
return get_return(True, {shap_beeswarm_plot: gr.Plot(cur_plt, visible=True, label=LN.shap_beeswarm_plot)})
|
643 |
+
|
644 |
+
|
645 |
+
def draw_learning_curve_validation_plot(model_list: list):
|
646 |
+
cur_plt = Dataset.draw_learning_curve_validation_plot(model_list)
|
647 |
+
|
648 |
+
return get_return(True, {learning_curve_validation_plot: gr.Plot(cur_plt, visible=True, label=LN.learning_curve_validation_plot)})
|
649 |
+
|
650 |
+
|
651 |
+
def draw_learning_curve_train_plot(model_list: list):
|
652 |
+
cur_plt = Dataset.draw_learning_curve_train_plot(model_list)
|
653 |
+
|
654 |
+
return get_return(True, {learning_curve_train_plot: gr.Plot(cur_plt, visible=True, label=LN.learning_curve_train_plot)})
|
655 |
+
|
656 |
+
|
657 |
+
def train_model(optimize, linear_regression_model_type):
|
658 |
+
Dataset.train_model(optimize, linear_regression_model_type)
|
659 |
+
|
660 |
+
return get_return(True)
|
661 |
+
|
662 |
+
|
663 |
+
def select_as_y(col: str):
|
664 |
+
Dataset.select_as_y(col)
|
665 |
+
|
666 |
+
return get_return(True)
|
667 |
+
|
668 |
+
|
669 |
+
def standardize_data(col_list: list):
|
670 |
+
Dataset.standardize_data(col_list)
|
671 |
+
|
672 |
+
return get_return(True)
|
673 |
+
|
674 |
+
|
675 |
+
def change_data_type_to_float():
|
676 |
+
Dataset.change_data_type_to_float()
|
677 |
+
|
678 |
+
return get_return(True)
|
679 |
+
|
680 |
+
|
681 |
+
def encode_label(col_list: list):
|
682 |
+
Dataset.encode_label(col_list)
|
683 |
+
|
684 |
+
return get_return(True, {display_encode_label_dataframe: gr.Dataframe(Dataset.get_str2int_mappings_df(), type="pandas", visible=True, label=LN.display_encode_label_dataframe)})
|
685 |
+
|
686 |
+
|
687 |
+
def del_duplicate():
|
688 |
+
Dataset.del_duplicate()
|
689 |
+
|
690 |
+
return get_return(True)
|
691 |
+
|
692 |
+
|
693 |
+
def del_all_na_col():
|
694 |
+
Dataset.del_all_na_col()
|
695 |
+
|
696 |
+
return get_return(True)
|
697 |
+
|
698 |
+
|
699 |
+
def remain_row(num):
|
700 |
+
Dataset.remain_row(num)
|
701 |
+
|
702 |
+
return get_return(True)
|
703 |
+
|
704 |
+
|
705 |
+
def del_col(col_list: list):
|
706 |
+
Dataset.del_col(col_list)
|
707 |
+
|
708 |
+
return get_return(True)
|
709 |
+
|
710 |
+
|
711 |
+
def add_index_into_df(df: pd.DataFrame) -> pd.DataFrame:
|
712 |
+
if df.empty:
|
713 |
+
return df
|
714 |
+
|
715 |
+
index_df = pd.DataFrame([x for x in range(len(df))], columns=["[*index]"])
|
716 |
+
|
717 |
+
return pd.concat([index_df, df], axis=1)
|
718 |
+
|
719 |
+
|
720 |
+
def choose_dataset(file: str):
|
721 |
+
if file == "自定义":
|
722 |
+
Dataset.clear()
|
723 |
+
|
724 |
+
return get_return(False)
|
725 |
+
|
726 |
+
df = load_data(file)
|
727 |
+
Dataset.update(file, df)
|
728 |
+
|
729 |
+
return get_return(True, {choose_custom_dataset_file: gr.File(visible=False)})
|
730 |
+
|
731 |
+
|
732 |
+
def choose_custom_dataset(file: str):
|
733 |
+
df = load_custom_data(file)
|
734 |
+
Dataset.update(file, df)
|
735 |
+
|
736 |
+
return get_return(True, {choose_custom_dataset_file: gr.File(Dataset.file, visible=True)})
|
737 |
+
|
738 |
+
|
739 |
+
with gr.Blocks() as demo:
|
740 |
+
|
741 |
+
'''
|
742 |
+
组件
|
743 |
+
'''
|
744 |
+
|
745 |
+
with gr.Tab("机器学习"):
|
746 |
+
# 选择数据源
|
747 |
+
with gr.Accordion("数据源"):
|
748 |
+
with gr.Group():
|
749 |
+
choose_dataset_radio = gr.Radio(Dataset.get_dataset_list(), label=LN.choose_dataset_radio)
|
750 |
+
choose_custom_dataset_file = gr.File(visible=False)
|
751 |
+
|
752 |
+
# 显示数据表信息
|
753 |
+
with gr.Accordion("当前数据信息"):
|
754 |
+
display_dataset_dataframe = gr.Dataframe(visible=False)
|
755 |
+
with gr.Row():
|
756 |
+
display_total_col_num_text = gr.Textbox(visible=False)
|
757 |
+
display_total_row_num_text = gr.Textbox(visible=False)
|
758 |
+
with gr.Column():
|
759 |
+
remain_row_slider = gr.Slider(visible=False)
|
760 |
+
remain_row_button = gr.Button(visible=False)
|
761 |
+
with gr.Row():
|
762 |
+
with gr.Column():
|
763 |
+
with gr.Row():
|
764 |
+
display_na_list_text = gr.Textbox(visible=False)
|
765 |
+
display_duplicate_num_text = gr.Textbox(visible=False)
|
766 |
+
with gr.Row():
|
767 |
+
del_all_na_col_button = gr.Button(visible=False)
|
768 |
+
del_duplicate_button = gr.Button(visible=False)
|
769 |
+
|
770 |
+
# 操作数据表
|
771 |
+
with gr.Accordion("数据处理"):
|
772 |
+
select_as_y_radio = gr.Radio(visible=False)
|
773 |
+
with gr.Row():
|
774 |
+
with gr.Column():
|
775 |
+
data_type_dataframe = gr.Dataframe(visible=False)
|
776 |
+
change_data_type_to_float_button = gr.Button(visible=False)
|
777 |
+
choose_assign_radio = gr.Radio(visible=False)
|
778 |
+
with gr.Column():
|
779 |
+
del_col_checkboxgroup = gr.Checkboxgroup(visible=False)
|
780 |
+
del_col_button = gr.Button(visible=False)
|
781 |
+
encode_label_checkboxgroup = gr.Checkboxgroup(visible=False)
|
782 |
+
encode_label_button = gr.Button(visible=False)
|
783 |
+
display_encode_label_dataframe = gr.Dataframe(visible=False)
|
784 |
+
standardize_data_checkboxgroup = gr.Checkboxgroup(visible=False)
|
785 |
+
standardize_data_button = gr.Button(visible=False)
|
786 |
+
|
787 |
+
# 数据模型
|
788 |
+
with gr.Accordion("数据模型"):
|
789 |
+
select_as_model_radio = gr.Radio(visible=False)
|
790 |
+
linear_regression_model_radio = gr.Radio(visible=False)
|
791 |
+
model_optimize_radio = gr.Radio(visible=False)
|
792 |
+
model_train_button = gr.Button(visible=False)
|
793 |
+
model_train_checkbox = gr.Checkbox(visible=False)
|
794 |
+
|
795 |
+
# 可视化
|
796 |
+
with gr.Accordion("数据可视化"):
|
797 |
+
learning_curve_checkboxgroup = gr.Checkboxgroup(visible=False)
|
798 |
+
with gr.Row():
|
799 |
+
learning_curve_train_button = gr.Button(visible=False)
|
800 |
+
learning_curve_validation_button = gr.Button(visible=False)
|
801 |
+
learning_curve_train_plot = gr.Plot(visible=False)
|
802 |
+
learning_curve_validation_plot = gr.Plot(visible=False)
|
803 |
+
shap_beeswarm_radio = gr.Radio(visible=False)
|
804 |
+
shap_beeswarm_button = gr.Button(visible=False)
|
805 |
+
with gr.Group():
|
806 |
+
shap_beeswarm_plot = gr.Plot(visible=False)
|
807 |
+
shap_beeswarm_plot_file = gr.File(visible=False)
|
808 |
+
|
809 |
+
'''
|
810 |
+
监听事件
|
811 |
+
'''
|
812 |
+
|
813 |
+
# 选择数据源
|
814 |
+
choose_dataset_radio.change(fn=choose_dataset, inputs=[choose_dataset_radio], outputs=get_outputs())
|
815 |
+
choose_custom_dataset_file.upload(fn=choose_custom_dataset, inputs=[choose_custom_dataset_file], outputs=get_outputs())
|
816 |
+
|
817 |
+
# 操作数据表
|
818 |
+
|
819 |
+
# 删除所选列
|
820 |
+
del_col_button.click(fn=del_col, inputs=[del_col_checkboxgroup], outputs=get_outputs())
|
821 |
+
# 保留行
|
822 |
+
remain_row_button.click(fn=remain_row, inputs=[remain_row_slider], outputs=get_outputs())
|
823 |
+
# 删除所有存在缺失值的列
|
824 |
+
del_all_na_col_button.click(fn=del_all_na_col, outputs=get_outputs())
|
825 |
+
# 删除所有重复的行
|
826 |
+
del_duplicate_button.click(fn=del_duplicate, outputs=get_outputs())
|
827 |
+
# 字符型列转数值型列
|
828 |
+
encode_label_button.click(fn=encode_label, inputs=[encode_label_checkboxgroup], outputs=get_outputs())
|
829 |
+
# 将所有数据强制转换为浮点型(除第1列之外)
|
830 |
+
change_data_type_to_float_button.click(fn=change_data_type_to_float, outputs=get_outputs())
|
831 |
+
# 标准化数据
|
832 |
+
standardize_data_button.click(fn=standardize_data, inputs=[standardize_data_checkboxgroup], outputs=get_outputs())
|
833 |
+
# 选择因变量
|
834 |
+
select_as_y_radio.change(fn=select_as_y, inputs=[select_as_y_radio], outputs=get_outputs())
|
835 |
+
# 选择任务类型(强制转换第1列)
|
836 |
+
choose_assign_radio.change(fn=choose_assign, inputs=[choose_assign_radio], outputs=get_outputs())
|
837 |
+
|
838 |
+
# 数据模型
|
839 |
+
select_as_model_radio.change(fn=select_as_model, inputs=[select_as_model_radio], outputs=get_outputs())
|
840 |
+
model_train_button.click(fn=train_model, inputs=[model_optimize_radio, linear_regression_model_radio], outputs=get_outputs())
|
841 |
+
|
842 |
+
# 可视化
|
843 |
+
learning_curve_train_button.click(fn=draw_learning_curve_train_plot, inputs=[learning_curve_checkboxgroup], outputs=get_outputs())
|
844 |
+
learning_curve_validation_button.click(fn=draw_learning_curve_validation_plot, inputs=[learning_curve_checkboxgroup], outputs=get_outputs())
|
845 |
+
shap_beeswarm_button.click(fn=draw_shap_beeswarm_plot, inputs=[shap_beeswarm_radio], outputs=get_outputs())
|
846 |
+
|
847 |
+
if __name__ == "__main__":
|
848 |
+
demo.launch()
|
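Every listener above follows the same shape: an event calls a thin wrapper, the wrapper mutates Dataset, and updated components are handed back through get_return/get_outputs. A stripped-down sketch of that Gradio 4 pattern is shown below; the component and function names are invented and it does not touch the Dataset class.

import gradio as gr

def show_greeting(name):
    # Returning a component from the callback updates the hidden placeholder below.
    return gr.Textbox(value=f"Hello, {name}!", visible=True)

with gr.Blocks() as sketch_demo:
    name_box = gr.Textbox(label="Name")
    greet_button = gr.Button("Greet")
    greeting_box = gr.Textbox(visible=False)

    greet_button.click(fn=show_greeting, inputs=[name_box], outputs=[greeting_box])

if __name__ == "__main__":
    sketch_demo.launch()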
metrics/__init__.py
ADDED
File without changes
|
metrics/calculate_classification_metrics.py
ADDED
@@ -0,0 +1,35 @@
import numpy as np
from sklearn.metrics import *
from sklearn.preprocessing import label_binarize

from visualization.draw_line_graph import draw_line_graph


def calculate_classification_metrics(pred_data, real_data, model_name):
    info = {}

    real_data = np.round(real_data, 0).astype(int)
    pred_data = np.round(pred_data, 0).astype(int)

    # Rows of a sklearn confusion matrix are true labels, columns are predictions.
    cur_confusion_matrix = confusion_matrix(real_data[:, 0], pred_data)
    info["Confusion matrix of " + model_name] = cur_confusion_matrix

    info["Accuracy of " + model_name] = np.sum(cur_confusion_matrix.diagonal()) / np.sum(cur_confusion_matrix)
    # Per-class precision divides each diagonal entry by its column sum (predicted counts);
    # per-class recall divides it by its row sum (true counts).
    info["Precision of " + model_name] = cur_confusion_matrix.diagonal() / np.sum(cur_confusion_matrix, axis=0)
    info["Recall of " + model_name] = cur_confusion_matrix.diagonal() / np.sum(cur_confusion_matrix, axis=1)
    info["F1-score of " + model_name] = np.mean(
        2 * np.multiply(info["Precision of " + model_name], info["Recall of " + model_name])
        / (info["Precision of " + model_name] + info["Recall of " + model_name])
    )

    # One-hot encode both label vectors so per-class ROC curves and a multi-label AUC can be computed.
    max_class = max(real_data)[0]
    min_class = min(real_data)[0]
    pred_data_ = label_binarize(pred_data, classes=range(min_class, max_class + 1))
    real_data_ = label_binarize(real_data, classes=range(min_class, max_class + 1))

    for i in range(max_class - min_class):
        fpr, tpr, thresholds = roc_curve(real_data_[:, i], pred_data_[:, i])
        # draw_line_graph(fpr, tpr, "ROC curve with AUC={:.2f}".format(auc(fpr, tpr)))

    info["AUC of " + model_name] = roc_auc_score(real_data_, pred_data_)

    return info
|
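A hedged usage sketch for calculate_classification_metrics follows; the arrays are invented, and the one shape requirement worth noting is that real_data must be two-dimensional because the function indexes real_data[:, 0].

import numpy as np
from metrics.calculate_classification_metrics import calculate_classification_metrics

# Three toy classes; the predictions disagree with the truth on two points.
real_data = np.array([[0], [1], [2], [0], [1], [2]], dtype=float)
pred_data = np.array([0, 1, 2, 0, 2, 1], dtype=float)

info = calculate_classification_metrics(pred_data, real_data, "demo model")
for key, value in info.items():
    print(key, value, sep=": ")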
metrics/calculate_regression_metrics.py
ADDED
@@ -0,0 +1,47 @@
import numpy as np
from sklearn.metrics import *


def calculate_ar2(real_data, pred_data):
    # Placeholder label: this helper only returns the adjusted R-square value,
    # so the dict keys are never shown to the user.
    model_name = "a"
    info = {}

    info["MAE of " + model_name] = mean_absolute_error(real_data, pred_data)
    info["MSE of " + model_name] = mean_squared_error(real_data, pred_data)
    info["RMSE of " + model_name] = np.sqrt(info["MSE of " + model_name])
    info["R-Square of " + model_name] = r2_score(real_data, pred_data)
    if isinstance(max(real_data), np.ndarray):
        info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data)[0] - 1)
    else:
        info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)

    return info["Adjusted R-Square of " + model_name]


def calculate_regression_metrics(pred_data, real_data, model_name):
    info = {}

    info["MAE of " + model_name] = mean_absolute_error(real_data, pred_data)
    info["MSE of " + model_name] = mean_squared_error(real_data, pred_data)
    info["RMSE of " + model_name] = np.sqrt(info["MSE of " + model_name])
    info["R-Square of " + model_name] = r2_score(real_data, pred_data)
    if isinstance(max(real_data), np.ndarray):
        info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data)[0] - 1)
    else:
        info["Adjusted R-Square of " + model_name] = 1 - (1 - info["R-Square of " + model_name]) * (len(pred_data) - 1) / (len(pred_data) - max(real_data) - 1)

    return info
|
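The adjusted R-square above follows the usual 1 - (1 - R²)(n - 1)/(n - p - 1) form, with max(real_data) standing in for the predictor count p. For reference, a sketch with n and p written out explicitly (toy numbers, not repository code):

from sklearn.metrics import r2_score

def adjusted_r2(real_data, pred_data, p):
    # n is the number of samples, p the number of predictors.
    n = len(pred_data)
    r2 = r2_score(real_data, pred_data)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(adjusted_r2([3.0, 2.0, 4.0, 5.0, 6.0], [2.8, 2.1, 3.9, 5.2, 5.9], p=2))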
requirements.txt
ADDED
@@ -0,0 +1,12 @@
numpy~=1.23.5
pandas~=1.5.3
scikit-learn~=1.2.1
hmmlearn~=0.3.0
matplotlib~=3.7.0
scikit-fuzzy~=0.4.2
gradio~=4.17.0
shap~=0.44.1
networkx~=2.8.4
scipy~=1.10.0
xgboost~=2.0.3
tqdm~=4.64.1
# scikit-optimize provides skopt.BayesSearchCV, which static/process.py imports
scikit-optimize
static/__init__.py
ADDED
File without changes
|
static/col.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
def get_pca_col():
|
2 |
+
return [
|
3 |
+
"p1_momentum_value_better",
|
4 |
+
"elapsed_time",
|
5 |
+
"server",
|
6 |
+
"serve_no",
|
7 |
+
"p1_ace",
|
8 |
+
"p2_ace",
|
9 |
+
"p1_winner",
|
10 |
+
"p2_winner",
|
11 |
+
"winner_shot_type",
|
12 |
+
# "p1_double_fault",
|
13 |
+
"p2_double_fault",
|
14 |
+
"p1_unf_err",
|
15 |
+
"p2_unf_err",
|
16 |
+
"p1_net_pt",
|
17 |
+
"p2_net_pt",
|
18 |
+
"p1_net_pt_won",
|
19 |
+
"p2_net_pt_won",
|
20 |
+
"p1_break_pt",
|
21 |
+
"p2_break_pt",
|
22 |
+
"p1_break_pt_won",
|
23 |
+
"p2_break_pt_won",
|
24 |
+
"p1_break_pt_missed",
|
25 |
+
"p2_break_pt_missed",
|
26 |
+
"p1_distance_run",
|
27 |
+
"p2_distance_run",
|
28 |
+
"rally_count",
|
29 |
+
"speed_mph",
|
30 |
+
"serve_width",
|
31 |
+
"serve_depth",
|
32 |
+
"return_depth"
|
33 |
+
]
|
34 |
+
|
35 |
+
|
36 |
+
def get_momentum_col(p):
|
37 |
+
return [
|
38 |
+
"point_victor",
|
39 |
+
"elapsed_time",
|
40 |
+
"server",
|
41 |
+
"serve_no",
|
42 |
+
"{}_ace".format(p),
|
43 |
+
# "p2_ace",
|
44 |
+
"{}_winner".format(p),
|
45 |
+
# "p2_winner",
|
46 |
+
"winner_shot_type",
|
47 |
+
# "p1_double_fault",
|
48 |
+
# "p2_double_fault",
|
49 |
+
"{}_unf_err".format(p),
|
50 |
+
# "p2_unf_err",
|
51 |
+
"{}_net_pt".format(p),
|
52 |
+
# "p2_net_pt",
|
53 |
+
"{}_net_pt_won".format(p),
|
54 |
+
# "p2_net_pt_won",
|
55 |
+
"{}_break_pt".format(p),
|
56 |
+
# "p2_break_pt",
|
57 |
+
"{}_break_pt_won".format(p),
|
58 |
+
# "p2_break_pt_won",
|
59 |
+
"{}_break_pt_missed".format(p),
|
60 |
+
# "p2_break_pt_missed",
|
61 |
+
"{}_distance_run".format(p),
|
62 |
+
# "p2_distance_run",
|
63 |
+
"rally_count",
|
64 |
+
"speed_mph",
|
65 |
+
"serve_width",
|
66 |
+
"serve_depth",
|
67 |
+
"return_depth"
|
68 |
+
]
|
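get_momentum_col is a template: every placeholder entry is formatted with the player prefix, so "p1" and "p2" yield parallel column lists. A brief usage sketch:

from static.col import get_momentum_col

p1_cols = get_momentum_col("p1")
p2_cols = get_momentum_col("p2")
print(p1_cols[:6])                   # ['point_victor', 'elapsed_time', 'server', 'serve_no', 'p1_ace', 'p1_winner']
print(len(p1_cols) == len(p2_cols))  # True: same template, different prefix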
static/config.py
ADDED
@@ -0,0 +1,51 @@
1 |
+
class Config:
|
2 |
+
# 随机种子
|
3 |
+
RANDOM_STATE = 123
|
4 |
+
# 绘图颜色组
|
5 |
+
COLORS = [
|
6 |
+
"#8074C8",
|
7 |
+
"#7895C1",
|
8 |
+
"#A8CBDF",
|
9 |
+
"#992224",
|
10 |
+
"#B54764",
|
11 |
+
"#E3625D",
|
12 |
+
"#EF8B67",
|
13 |
+
"#F0C284"
|
14 |
+
]
|
15 |
+
COLORS_1 = [
|
16 |
+
"#91CCC0",
|
17 |
+
"#7FABD1",
|
18 |
+
"#F7AC53",
|
19 |
+
"#EC6E66",
|
20 |
+
"#B5CE4E",
|
21 |
+
"#BD7795",
|
22 |
+
"#B55384",
|
23 |
+
"#474769",
|
24 |
+
"#257D88",
|
25 |
+
"#ED8D5A",
|
26 |
+
"#BFDFD2",
|
27 |
+
"#EFCE87"
|
28 |
+
]
|
29 |
+
|
30 |
+
COLORS_2 = [
|
31 |
+
"#A21A54",
|
32 |
+
"#E7724F",
|
33 |
+
"#32183C"
|
34 |
+
]
|
35 |
+
|
36 |
+
COLORS_3 = [
|
37 |
+
"#ABD1BC",
|
38 |
+
"#CCCC99",
|
39 |
+
"#E3BBED"
|
40 |
+
]
|
41 |
+
|
42 |
+
|
43 |
+
COLORS_4 = [
|
44 |
+
"#CFCFD0",
|
45 |
+
"#B6B3D6",
|
46 |
+
"#F58F7A",
|
47 |
+
"#E9687A",
|
48 |
+
]
|
49 |
+
|
50 |
+
# 预测图展示的点个数
|
51 |
+
DISPLAY_RANGE = 100
|
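Config only holds constants, but two of them do real work elsewhere in the repository: RANDOM_STATE is passed to every train_test_split and KFold call so runs stay reproducible, and the COLORS lists are indexed by the plotting helpers. A minimal sketch (the dataframe is invented):

import pandas as pd
from sklearn.model_selection import train_test_split
from static.config import Config

df = pd.DataFrame({"y": [0, 1, 0, 1], "x": [0.1, 0.9, 0.2, 0.8]})
x_train, x_test, y_train, y_test = train_test_split(
    df[["x"]], df["y"], random_state=Config.RANDOM_STATE, train_size=0.5
)
print(len(x_train), len(x_test))  # 2 2
print(Config.COLORS[0])           # "#8074C8"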
static/process.py
ADDED
@@ -0,0 +1,313 @@
1 |
+
import numpy as np
|
2 |
+
from sklearn.model_selection import train_test_split
|
3 |
+
from sklearn.model_selection import KFold
|
4 |
+
from sklearn import preprocessing
|
5 |
+
from sklearn.model_selection import GridSearchCV
|
6 |
+
from skopt import BayesSearchCV
|
7 |
+
import copy
|
8 |
+
import pandas as pd
|
9 |
+
from scipy.stats import spearmanr
|
10 |
+
|
11 |
+
from sklearn.datasets import load_iris
|
12 |
+
from sklearn.datasets import load_wine
|
13 |
+
from sklearn.datasets import load_breast_cancer
|
14 |
+
from scipy.linalg import eig
|
15 |
+
|
16 |
+
from static.config import Config
|
17 |
+
|
18 |
+
|
19 |
+
def match_split(df: pd.DataFrame):
|
20 |
+
return df.groupby("match_id")
|
21 |
+
|
22 |
+
|
23 |
+
# 斯皮尔曼秩相关系数
|
24 |
+
def calculate_spearmanr(x, y):
|
25 |
+
rho, p_value = spearmanr(x, y)
|
26 |
+
|
27 |
+
return rho, p_value
|
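calculate_spearmanr is a thin wrapper over scipy.stats.spearmanr. Because the statistic works on ranks, any monotone relationship scores rho = 1.0, as the small invented example below shows.

from scipy.stats import spearmanr

x = [1, 2, 3, 4, 5]
y = [1, 4, 9, 16, 25]  # monotone in x, although not linear
rho, p_value = spearmanr(x, y)
print(rho)             # 1.0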
28 |
+
|
29 |
+
|
30 |
+
def calculate_remain_positive_points(df: pd.DataFrame):
|
31 |
+
# remain_positive距离无限远设置为len(df)
|
32 |
+
|
33 |
+
df["p1_remain_positive"] = 0
|
34 |
+
df["p2_remain_positive"] = 0
|
35 |
+
p1_zero_distance_list = []
|
36 |
+
p2_zero_distance_list = []
|
37 |
+
|
38 |
+
for i in range(1, len(df)):
|
39 |
+
if (df.loc[i, "p1_momentum_value_better"] > 0
|
40 |
+
and i != 0):
|
41 |
+
p1_zero_distance_list.append(i)
|
42 |
+
elif (df.loc[i, "p1_momentum_value_better"] < 0
|
43 |
+
and i != 0):
|
44 |
+
p2_zero_distance_list.append(i)
|
45 |
+
|
46 |
+
for j in range(len(df)):
|
47 |
+
for x in p1_zero_distance_list:
|
48 |
+
if j <= x:
|
49 |
+
df.loc[j, "p1_remain_positive"] = x - j
|
50 |
+
break
|
51 |
+
else:
|
52 |
+
continue
|
53 |
+
|
54 |
+
for j in range(len(df)):
|
55 |
+
for x in p2_zero_distance_list:
|
56 |
+
if j <= x:
|
57 |
+
df.loc[j, "p2_remain_positive"] = x - j
|
58 |
+
break
|
59 |
+
else:
|
60 |
+
continue
|
61 |
+
|
62 |
+
return df
|
63 |
+
|
64 |
+
|
65 |
+
def calculate_swing_point(df:pd.DataFrame):
|
66 |
+
# swing距离无限远设置为len(df)
|
67 |
+
|
68 |
+
df["swing"] = 0
|
69 |
+
zero_distance_list = []
|
70 |
+
|
71 |
+
for i in range(1, len(df)):
|
72 |
+
if (df.loc[i, "p1_momentum_value_better"] > 0 and df.loc[i-1, "p1_momentum_value_better"] < 0
|
73 |
+
and i != 0) or (df.loc[i, "p1_momentum_value_better"] < 0 and df.loc[i - 1, "p1_momentum_value_better"] > 0
|
74 |
+
and i != 0):
|
75 |
+
zero_distance_list.append(i)
|
76 |
+
|
77 |
+
for j in range(len(df)):
|
78 |
+
for x in zero_distance_list:
|
79 |
+
if j <= x:
|
80 |
+
df.loc[j, "swing"] = x - j
|
81 |
+
break
|
82 |
+
else:
|
83 |
+
continue
|
84 |
+
|
85 |
+
return df
|
86 |
+
|
87 |
+
|
88 |
+
def replace_na_to_label(df: pd.DataFrame):
|
89 |
+
return df.fillna("Not A Number")
|
90 |
+
|
91 |
+
|
92 |
+
def get_state_distribution(data):
|
93 |
+
# get the matrix of correlation coefficients
|
94 |
+
covX = np.around(np.corrcoef(data.T), decimals=3)
|
95 |
+
|
96 |
+
# draw_heat_map(covX, "related", False)
|
97 |
+
|
98 |
+
# Solve the eigenvalues and eigenvectors of the coefficient correlation matrix
|
99 |
+
eigenvalues, eigenvectors = np.linalg.eig(covX.T)
|
100 |
+
|
101 |
+
eigenvalues = np.around(eigenvalues, decimals=3)
|
102 |
+
|
103 |
+
eigenvalues_dict = dict(zip(eigenvalues.tolist(), list(range(0, len(eigenvalues)))))
|
104 |
+
|
105 |
+
# Sort feature values in descending order
|
106 |
+
eigenvalues = sorted(eigenvalues, reverse=True)
|
107 |
+
|
108 |
+
for i, value in enumerate(eigenvalues):
|
109 |
+
if i == 0:
|
110 |
+
sorted_eigenvectors = eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)
|
111 |
+
else:
|
112 |
+
sorted_eigenvectors = np.concatenate((sorted_eigenvectors, eigenvectors[:, eigenvalues_dict[value]].reshape(-1, 1)), axis=1)
|
113 |
+
|
114 |
+
# draw_line_graph(range(1, len(eigenvalues) + 1), eigenvalues, "Eigenvalue")
|
115 |
+
|
116 |
+
# get the contribution of the eigenvalues
|
117 |
+
contribution = eigenvalues / np.sum(eigenvalues)
|
118 |
+
|
119 |
+
return contribution
|
120 |
+
|
121 |
+
|
122 |
+
# 指数加权平均
|
123 |
+
def exponential_moving_average(df):
|
124 |
+
alpha = 0.3
|
125 |
+
|
126 |
+
ema = [df[0]]
|
127 |
+
|
128 |
+
for i in range(1, len(df)):
|
129 |
+
ema_value = alpha * df[i] + (1 - alpha) * ema[i-1]
|
130 |
+
ema.append(ema_value)
|
131 |
+
|
132 |
+
return ema
|
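The loop above implements the standard EWMA recursion ema[i] = alpha * x[i] + (1 - alpha) * ema[i-1], seeded with ema[0] = x[0]. As a cross-check only (not repository code), pandas reproduces the same numbers when adjust=False:

import pandas as pd

series = pd.Series([10.0, 12.0, 11.0, 15.0])
ema = series.ewm(alpha=0.3, adjust=False).mean()
print(ema.tolist())  # [10.0, 10.6, 10.72, 12.004]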
133 |
+
|
134 |
+
|
135 |
+
def need_to_mark_in_plot(df, col_name):
|
136 |
+
return df.where(df[col_name] == 1).dropna()
|
137 |
+
|
138 |
+
|
139 |
+
def point_victor_mapping(df):
|
140 |
+
mapping = {
|
141 |
+
1: 0.0,
|
142 |
+
2: 1.0
|
143 |
+
}
|
144 |
+
df["point_victor"] = df["point_victor"].map(mapping)
|
145 |
+
|
146 |
+
return df
|
147 |
+
|
148 |
+
|
149 |
+
def pick_matches_with_name(df, name):
|
150 |
+
df = df.where(df["match_id"] == name).dropna()
|
151 |
+
|
152 |
+
p1_name = df["player1"].iloc[0]
|
153 |
+
p2_name = df["player2"].iloc[0]
|
154 |
+
|
155 |
+
return df, p1_name, p2_name
|
156 |
+
|
157 |
+
|
158 |
+
def pick_matches_with_longest(df):
|
159 |
+
target_match_id = df.groupby("match_id").size().idxmax()
|
160 |
+
|
161 |
+
df = df.where(df["match_id"] == target_match_id).dropna()
|
162 |
+
|
163 |
+
p1_name = df["player1"].iloc[0]
|
164 |
+
p2_name = df["player2"].iloc[0]
|
165 |
+
|
166 |
+
return df, p1_name, p2_name
|
167 |
+
|
168 |
+
|
169 |
+
def choose_y_col_in_dataframe(df: pd.DataFrame, y_col: str):
|
170 |
+
y_data = df[y_col]
|
171 |
+
df.drop(y_col, axis=1, inplace=True)
|
172 |
+
df.insert(0, y_col, y_data)
|
173 |
+
|
174 |
+
return df
|
175 |
+
|
176 |
+
|
177 |
+
def load_data(sort):
|
178 |
+
if sort == "Iris Dataset":
|
179 |
+
sk_data = load_iris()
|
180 |
+
elif sort == "Wine Dataset":
|
181 |
+
sk_data = load_wine()
|
182 |
+
elif sort == "Breast Cancer Dataset":
|
183 |
+
sk_data = load_breast_cancer()
|
184 |
+
|
185 |
+
target_data = sk_data.target.astype(str)
|
186 |
+
for i in range(len(sk_data.target_names)):
|
187 |
+
target_data = np.where(target_data == str(i), sk_data.target_names[i], target_data)
|
188 |
+
|
189 |
+
sk_feature_names = sk_data.feature_names
|
190 |
+
sk_data = np.concatenate((target_data.reshape(-1, 1), sk_data.data), axis=1)
|
191 |
+
sk_feature_names = np.insert(sk_feature_names, 0, "species")
|
192 |
+
|
193 |
+
df = pd.DataFrame(data=sk_data, columns=sk_feature_names)
|
194 |
+
|
195 |
+
return df
|
196 |
+
|
197 |
+
|
198 |
+
def load_custom_data(file):
|
199 |
+
return pd.read_csv(file)
|
200 |
+
|
201 |
+
|
202 |
+
def preprocess_raw_data_filtering(df):
|
203 |
+
info = {}
|
204 |
+
|
205 |
+
len_0 = len(df)
|
206 |
+
info["Total size of raw data"] = len_0
|
207 |
+
|
208 |
+
# Delete the column "CUSTOMER_ID"
|
209 |
+
# df.drop("CUSTOMER_ID", axis=1, inplace=True)
|
210 |
+
|
211 |
+
# Remove duplicate data
|
212 |
+
df = df.drop_duplicates()
|
213 |
+
len_1 = len_0 - len(df)
|
214 |
+
info["Number of duplicates in the raw data"] = len_1
|
215 |
+
|
216 |
+
# Remove "nan" data
|
217 |
+
# df = remove_nan_from_data(df)
|
218 |
+
# len_2 = len_0 - len_1 - len(df)
|
219 |
+
# info["Number of nan in the raw data"] = len_2
|
220 |
+
|
221 |
+
info["Total size of filtered data after data preprocessing"] = len(df)
|
222 |
+
|
223 |
+
# Save the cleaned data to a csv format file
|
224 |
+
# df.to_csv("../data/filtered_data.csv", index=False)
|
225 |
+
|
226 |
+
return df, info
|
227 |
+
|
228 |
+
|
229 |
+
def remove_nan_from_data(df):
|
230 |
+
# Remove "nan" data
|
231 |
+
df.dropna(inplace=True)
|
232 |
+
|
233 |
+
return df
|
234 |
+
|
235 |
+
|
236 |
+
# Get standardized data
|
237 |
+
def get_standardized_data(df):
|
238 |
+
array = np.concatenate(((df.iloc[:, :1]).values, preprocessing.scale(df.iloc[:, 1:])), axis=1)
|
239 |
+
|
240 |
+
return array
|
241 |
+
|
242 |
+
|
243 |
+
def split_dataset(array):
|
244 |
+
x_train_and_validate, x_test, y_train_and_validate, y_test = train_test_split(
|
245 |
+
array[:, 1:],
|
246 |
+
array[:, :1],
|
247 |
+
random_state=Config.RANDOM_STATE,
|
248 |
+
train_size=0.8
|
249 |
+
)
|
250 |
+
|
251 |
+
return x_train_and_validate, x_test, y_train_and_validate, y_test
|
252 |
+
|
253 |
+
|
254 |
+
def k_fold_cross_validation_data_segmentation(x_train, y_train):
|
255 |
+
k = 5
|
256 |
+
|
257 |
+
train_data_array = np.concatenate((y_train, x_train), axis=1)
|
258 |
+
|
259 |
+
k_fold = KFold(n_splits=k, shuffle=True, random_state=Config.RANDOM_STATE)
|
260 |
+
|
261 |
+
train_data_list = []
|
262 |
+
validate_data_list = []
|
263 |
+
for train_index, validate_index in k_fold.split(train_data_array):
|
264 |
+
train_data_list.append(train_data_array[train_index])
|
265 |
+
validate_data_list.append(train_data_array[validate_index])
|
266 |
+
|
267 |
+
train_and_validate_data_list = []
|
268 |
+
|
269 |
+
for i in range(k):
|
270 |
+
train_and_validate_data_list.append((
|
271 |
+
train_data_list[i][:, 1:],
|
272 |
+
validate_data_list[i][:, 1:],
|
273 |
+
train_data_list[i][:, 0],
|
274 |
+
validate_data_list[i][:, 0]
|
275 |
+
))
|
276 |
+
|
277 |
+
return train_and_validate_data_list
|
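A hedged usage sketch for the splitter above, on invented random arrays. Note that y_train is expected as a column vector because it is concatenated in front of x_train before the KFold split, and that each of the five returned tuples is ordered (x_train, x_validate, y_train, y_validate).

import numpy as np
from static.process import k_fold_cross_validation_data_segmentation

rng = np.random.default_rng(0)
x_train = rng.normal(size=(20, 3))
y_train = rng.integers(0, 2, size=(20, 1)).astype(float)

folds = k_fold_cross_validation_data_segmentation(x_train, y_train)
fold_x_train, fold_x_validate, fold_y_train, fold_y_validate = folds[0]
print(len(folds))                                 # 5
print(fold_x_train.shape, fold_x_validate.shape)  # (16, 3) (4, 3)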
278 |
+
|
279 |
+
|
280 |
+
def grid_search(params, model, x_train, y_train, scoring=None):
|
281 |
+
info = {}
|
282 |
+
|
283 |
+
if scoring == "neg_mean_squared_error":
|
284 |
+
grid_search_model = GridSearchCV(model, params, cv=5, scoring="neg_mean_squared_error")
|
285 |
+
else:
|
286 |
+
grid_search_model = GridSearchCV(model, params, cv=5)
|
287 |
+
|
288 |
+
grid_search_model.fit(x_train, y_train.ravel())
|
289 |
+
|
290 |
+
info["Optimal hyperparameters"] = grid_search_model.best_params_
|
291 |
+
|
292 |
+
best_model = grid_search_model.best_estimator_
|
293 |
+
|
294 |
+
return best_model
|
295 |
+
|
296 |
+
|
297 |
+
def bayes_search(params, model, x_train, y_train, scoring=None):
|
298 |
+
info = {}
|
299 |
+
|
300 |
+
if scoring == "neg_mean_squared_error":
|
301 |
+
bayes_search_model = BayesSearchCV(model, params, cv=5, n_iter=50, scoring="neg_mean_squared_error")
|
302 |
+
else:
|
303 |
+
bayes_search_model = BayesSearchCV(model, params, cv=5, n_iter=50)
|
304 |
+
|
305 |
+
bayes_search_model.fit(x_train, y_train)
|
306 |
+
|
307 |
+
info["Optimal hyperparameters"] = bayes_search_model.best_params_
|
308 |
+
|
309 |
+
best_model = bayes_search_model.best_estimator_
|
310 |
+
|
311 |
+
return best_model
|
312 |
+
|
313 |
+
|
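To close the file, a hedged usage sketch for grid_search on invented data (Ridge is chosen arbitrarily); bayes_search is called the same way with skopt-style search spaces. The function returns the refitted best estimator and discards the info dict it builds internally.

import numpy as np
from sklearn.linear_model import Ridge
from static.process import grid_search

rng = np.random.default_rng(0)
x_train = rng.normal(size=(40, 3))
y_train = x_train @ np.array([[1.0], [2.0], [0.5]]) + rng.normal(scale=0.1, size=(40, 1))

params = {"alpha": [0.01, 0.1, 1.0, 10.0]}
best_model = grid_search(params, Ridge(), x_train, y_train, scoring="neg_mean_squared_error")
print(best_model.alpha)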
visualization/__init__.py
ADDED
File without changes
|
visualization/draw_boxplot.py
ADDED
@@ -0,0 +1,26 @@
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
|
3 |
+
from static.config import Config
|
4 |
+
|
5 |
+
|
6 |
+
# draw boxplot
|
7 |
+
def draw_boxplot(x_data, title):
|
8 |
+
plt.figure(figsize=(10, 14))
|
9 |
+
plt.grid(True)
|
10 |
+
|
11 |
+
plt.boxplot(
|
12 |
+
x_data,
|
13 |
+
meanline=True,
|
14 |
+
showmeans=True,
|
15 |
+
medianprops={"color": Config.COLORS[0], "linewidth": 1.5},
|
16 |
+
meanprops={"color": Config.COLORS[1], "ls": "--", "linewidth": 1.5},
|
17 |
+
flierprops={"marker": "o", "markerfacecolor": Config.COLORS[2]},
|
18 |
+
labels=x_data.columns.values
|
19 |
+
)
|
20 |
+
|
21 |
+
plt.xticks(rotation=-45)
|
22 |
+
plt.title(title)
|
23 |
+
|
24 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
25 |
+
|
26 |
+
plt.show()
|
visualization/draw_heat_map.py
ADDED
@@ -0,0 +1,40 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from static.config import Config
|
6 |
+
|
7 |
+
|
8 |
+
# Draw heat map
|
9 |
+
def draw_heat_map(x_data, title, is_rotate, col_name):
|
10 |
+
# col_name = np.delete(col_name, np.where(col_name == "swing"))
|
11 |
+
|
12 |
+
plt.rcParams.update({'figure.autolayout': True})
|
13 |
+
|
14 |
+
plt.figure(figsize=(16, 16))
|
15 |
+
|
16 |
+
if isinstance(x_data, np.ndarray):
|
17 |
+
np_data = np.around(x_data.astype("float64"), 2)
|
18 |
+
pd_data = pd.DataFrame(x_data)
|
19 |
+
elif isinstance(x_data, pd.DataFrame):
|
20 |
+
np_data = np.around(x_data.to_numpy().astype("float64"), 2)
|
21 |
+
pd_data = x_data
|
22 |
+
|
23 |
+
for i in range(np_data.shape[0]):
|
24 |
+
for j in range(np_data.shape[1]):
|
25 |
+
plt.text(j, i, np_data[i, j], ha="center", va="center", color="w")
|
26 |
+
|
27 |
+
if is_rotate:
|
28 |
+
plt.xticks(np.arange(len(pd_data.columns.values)), col_name, rotation=-90)
|
29 |
+
else:
|
30 |
+
plt.xticks(np.arange(len(pd_data.columns.values)), col_name)
|
31 |
+
|
32 |
+
plt.yticks(np.arange(len(pd_data.index.values)), col_name)
|
33 |
+
plt.imshow(np_data)
|
34 |
+
# plt.colorbar(False)
|
35 |
+
plt.tight_layout()
|
36 |
+
# plt.title(title)
|
37 |
+
|
38 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
39 |
+
|
40 |
+
plt.show()
|
visualization/draw_histogram.py
ADDED
@@ -0,0 +1,40 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
# Plot bar charts
|
8 |
+
def draw_histogram(x_data, y_data, will_rotate, will_show_text, title):
|
9 |
+
fig, ax = plt.subplots(figsize=(10, 8))
|
10 |
+
|
11 |
+
bars = plt.bar(
|
12 |
+
np.arange(0, len(x_data)),
|
13 |
+
x_data,
|
14 |
+
align="center",
|
15 |
+
alpha=1,
|
16 |
+
color=Config.COLORS,
|
17 |
+
tick_label=y_data
|
18 |
+
)
|
19 |
+
|
20 |
+
# Bar annotation
|
21 |
+
if will_show_text:
|
22 |
+
for bar in bars:
|
23 |
+
ax.annotate(
|
24 |
+
str(bar.get_height()),
|
25 |
+
xy=(bar.get_x() + bar.get_width() / 2,
|
26 |
+
bar.get_height()),
|
27 |
+
xytext=(0, 3),
|
28 |
+
textcoords="offset points",
|
29 |
+
va="bottom",
|
30 |
+
ha="center"
|
31 |
+
)
|
32 |
+
|
33 |
+
if will_rotate:
|
34 |
+
plt.xticks(rotation=-90)
|
35 |
+
|
36 |
+
plt.title(title)
|
37 |
+
|
38 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
39 |
+
|
40 |
+
plt.show()
|
visualization/draw_histogram_line_subgraph.py
ADDED
@@ -0,0 +1,48 @@
1 |
+
import numpy as np
|
2 |
+
from matplotlib import pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
def draw_histogram_line_subgraph(total_data_for_plot):
|
8 |
+
# Manually adjust based on the data
|
9 |
+
layout = """
|
10 |
+
ABC
|
11 |
+
DDE
|
12 |
+
FGH
|
13 |
+
IJK
|
14 |
+
"""
|
15 |
+
|
16 |
+
fig, ax = plt.subplot_mosaic(layout, figsize=(16, 16))
|
17 |
+
|
18 |
+
for i, data in enumerate(total_data_for_plot):
|
19 |
+
if data[0] == "line_graph":
|
20 |
+
ax[str(chr(i+65))].grid()
|
21 |
+
ax[str(chr(i+65))].plot(
|
22 |
+
data[1],
|
23 |
+
data[2],
|
24 |
+
"-o",
|
25 |
+
color=Config.COLORS[0],
|
26 |
+
markersize=4
|
27 |
+
)
|
28 |
+
ax[str(chr(i+65))].set_title(data[3])
|
29 |
+
elif data[0] == "histogram":
|
30 |
+
ax[str(chr(i+65))].grid()
|
31 |
+
ax[str(chr(i+65))].bar(
|
32 |
+
np.arange(0, len(data[1])),
|
33 |
+
data[1],
|
34 |
+
align="center",
|
35 |
+
alpha=1,
|
36 |
+
color=Config.COLORS,
|
37 |
+
tick_label=data[2]
|
38 |
+
)
|
39 |
+
|
40 |
+
if data[3]:
|
41 |
+
ax[str(chr(i+65))].tick_params(axis='x', labelrotation=-90)
|
42 |
+
|
43 |
+
ax[str(chr(i+65))].set_title(data[5])
|
44 |
+
|
45 |
+
plt.tight_layout()
|
46 |
+
plt.savefig("./diagram/{}.png".format("total"), dpi=300)
|
47 |
+
|
48 |
+
plt.show()
|
visualization/draw_learning_curve.py
ADDED
@@ -0,0 +1,44 @@
1 |
+
import numpy as np
|
2 |
+
from matplotlib import pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
def draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std):
|
8 |
+
plt.figure(figsize=(10, 6))
|
9 |
+
|
10 |
+
plt.fill_between(
|
11 |
+
train_sizes,
|
12 |
+
train_scores_mean - train_scores_std,
|
13 |
+
train_scores_mean + train_scores_std,
|
14 |
+
alpha=0.1,
|
15 |
+
color=Config.COLORS[0]
|
16 |
+
)
|
17 |
+
plt.plot(
|
18 |
+
train_sizes,
|
19 |
+
train_scores_mean,
|
20 |
+
"o-",
|
21 |
+
color=Config.COLORS[0],
|
22 |
+
label="Training score"
|
23 |
+
)
|
24 |
+
|
25 |
+
plt.fill_between(
|
26 |
+
train_sizes,
|
27 |
+
test_scores_mean - test_scores_std,
|
28 |
+
test_scores_mean + test_scores_std,
|
29 |
+
alpha=0.1,
|
30 |
+
color=Config.COLORS[1]
|
31 |
+
)
|
32 |
+
plt.plot(
|
33 |
+
train_sizes,
|
34 |
+
test_scores_mean,
|
35 |
+
"o-",
|
36 |
+
color=Config.COLORS[1],
|
37 |
+
label="Cross-validation score"
|
38 |
+
)
|
39 |
+
|
40 |
+
plt.title("Learning curve")
|
41 |
+
plt.xlabel("Sizes")
|
42 |
+
plt.ylabel("Accuracy")
|
43 |
+
plt.legend(loc="best")
|
44 |
+
plt.show()
|
visualization/draw_learning_curve_total.py
ADDED
@@ -0,0 +1,76 @@
1 |
+
import numpy as np
|
2 |
+
from matplotlib import pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
def draw_learning_curve_total(input_dict, type):
|
8 |
+
plt.figure(figsize=(10, 6), dpi=300)
|
9 |
+
|
10 |
+
if type == "train":
|
11 |
+
i = 0
|
12 |
+
for label_name, values in input_dict.items():
|
13 |
+
train_sizes = values[0]
|
14 |
+
train_scores_mean = values[1]
|
15 |
+
train_scores_std = values[2]
|
16 |
+
test_scores_mean = values[3]
|
17 |
+
test_scores_std = values[4]
|
18 |
+
|
19 |
+
plt.fill_between(
|
20 |
+
train_sizes,
|
21 |
+
train_scores_mean - train_scores_std,
|
22 |
+
train_scores_mean + train_scores_std,
|
23 |
+
alpha=0.1,
|
24 |
+
color=Config.COLORS[i]
|
25 |
+
)
|
26 |
+
|
27 |
+
plt.plot(
|
28 |
+
train_sizes,
|
29 |
+
train_scores_mean,
|
30 |
+
"o-",
|
31 |
+
color=Config.COLORS[i],
|
32 |
+
label=label_name
|
33 |
+
)
|
34 |
+
|
35 |
+
i += 1
|
36 |
+
|
37 |
+
title = "Training Learning curve"
|
38 |
+
# plt.title(title)
|
39 |
+
|
40 |
+
else:
|
41 |
+
i = 0
|
42 |
+
for label_name, values in input_dict.items():
|
43 |
+
train_sizes = values[0]
|
44 |
+
train_scores_mean = values[1]
|
45 |
+
train_scores_std = values[2]
|
46 |
+
test_scores_mean = values[3]
|
47 |
+
test_scores_std = values[4]
|
48 |
+
|
49 |
+
plt.fill_between(
|
50 |
+
train_sizes,
|
51 |
+
test_scores_mean - test_scores_std,
|
52 |
+
test_scores_mean + test_scores_std,
|
53 |
+
alpha=0.1,
|
54 |
+
color=Config.COLORS[i]
|
55 |
+
)
|
56 |
+
plt.plot(
|
57 |
+
train_sizes,
|
58 |
+
test_scores_mean,
|
59 |
+
"o-",
|
60 |
+
color=Config.COLORS[i],
|
61 |
+
label=label_name
|
62 |
+
)
|
63 |
+
|
64 |
+
i += 1
|
65 |
+
|
66 |
+
title = "Cross-validation Learning curve"
|
67 |
+
# plt.title(title)
|
68 |
+
|
69 |
+
plt.xlabel("Sizes")
|
70 |
+
plt.ylabel("Adjusted R-square")
|
71 |
+
plt.legend()
|
72 |
+
|
73 |
+
# plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
74 |
+
# plt.show()
|
75 |
+
return plt
|
76 |
+
|
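draw_learning_curve_total expects each dictionary value to be the 5-tuple (train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std); in app.py that tuple comes from each container's get_learning_curve_values(). A hedged sketch building the same tuple with sklearn's learning_curve on invented data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
from visualization.draw_learning_curve_total import draw_learning_curve_total

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = x @ np.array([1.5, -0.5]) + rng.normal(scale=0.1, size=100)

sizes, train_scores, test_scores = learning_curve(LinearRegression(), x, y, cv=5)
values = (
    sizes,
    train_scores.mean(axis=1), train_scores.std(axis=1),
    test_scores.mean(axis=1), test_scores.std(axis=1),
)
plt_obj = draw_learning_curve_total({"LinearRegression": values}, "train")
plt_obj.savefig("learning_curve_sketch.png", dpi=300)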
visualization/draw_line_graph.py
ADDED
@@ -0,0 +1,40 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
# draw line graph
|
8 |
+
def draw_line_graph(x_data, y_data: list, title):
|
9 |
+
plt.figure(figsize=(10, 8))
|
10 |
+
|
11 |
+
plt.plot(
|
12 |
+
x_data,
|
13 |
+
y_data,
|
14 |
+
"-o",
|
15 |
+
color=Config.COLORS[0]
|
16 |
+
)
|
17 |
+
|
18 |
+
plt.title(title)
|
19 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
20 |
+
|
21 |
+
plt.show()
|
22 |
+
|
23 |
+
|
24 |
+
def draw_line_graph_1(x_data, y_data: list, title, labels: list):
|
25 |
+
plt.figure(figsize=(10, 8))
|
26 |
+
|
27 |
+
for i, single_y_data in enumerate(y_data):
|
28 |
+
plt.plot(
|
29 |
+
x_data,
|
30 |
+
single_y_data,
|
31 |
+
"-o",
|
32 |
+
color=Config.COLORS[i],
|
33 |
+
label=labels[i]
|
34 |
+
)
|
35 |
+
|
36 |
+
plt.legend()
|
37 |
+
plt.title(title)
|
38 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
39 |
+
|
40 |
+
plt.show()
|
visualization/draw_momentum.py
ADDED
@@ -0,0 +1,52 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from sklearn.metrics import *
|
4 |
+
from sklearn.preprocessing import label_binarize
|
5 |
+
|
6 |
+
from static.config import Config
|
7 |
+
|
8 |
+
|
9 |
+
def draw_momentum(df, p1_name, p2_name):
|
10 |
+
plt.figure(figsize=(10, 6))
|
11 |
+
|
12 |
+
plt.plot(
|
13 |
+
df.loc[:, "elapsed_time"].values,
|
14 |
+
df.loc[:, "p1_momentum_value"].values,
|
15 |
+
"-",
|
16 |
+
color=Config.COLORS_1[8],
|
17 |
+
alpha=0.5,
|
18 |
+
label=p1_name
|
19 |
+
)
|
20 |
+
plt.plot(
|
21 |
+
df.loc[:, "elapsed_time"].values,
|
22 |
+
df.loc[:, "p2_momentum_value"].values,
|
23 |
+
"-",
|
24 |
+
color=Config.COLORS_1[9],
|
25 |
+
alpha=0.5,
|
26 |
+
label=p2_name
|
27 |
+
)
|
28 |
+
plt.axhline(
|
29 |
+
y=0,
|
30 |
+
linestyle="--",
|
31 |
+
color="black",
|
32 |
+
alpha=0.5
|
33 |
+
)
|
34 |
+
plt.plot(
|
35 |
+
df.loc[:, "elapsed_time"].values,
|
36 |
+
df.loc[:, "p1_momentum_value_better"].values,
|
37 |
+
"-",
|
38 |
+
color=Config.COLORS_1[10],
|
39 |
+
alpha=0.7,
|
40 |
+
label="Degree of Superiority"
|
41 |
+
)
|
42 |
+
|
43 |
+
title = "Momentum"
|
44 |
+
# plt.title(title)
|
45 |
+
|
46 |
+
plt.xlabel("Elapsed time")
|
47 |
+
plt.ylabel("Momentum value")
|
48 |
+
plt.legend()
|
49 |
+
|
50 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
51 |
+
|
52 |
+
plt.show()
|
visualization/draw_parallel_coordinates.py
ADDED
@@ -0,0 +1,46 @@
1 |
+
import pandas as pd
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
def draw_parallel_coordinates(df):
|
8 |
+
df.drop("match_id", axis=1, inplace=True)
|
9 |
+
df.drop("player1", axis=1, inplace=True)
|
10 |
+
df.drop("player2", axis=1, inplace=True)
|
11 |
+
df.drop("elapsed_time", axis=1, inplace=True)
|
12 |
+
df.drop("set_no", axis=1, inplace=True)
|
13 |
+
df.drop("game_no", axis=1, inplace=True)
|
14 |
+
df.drop("point_no", axis=1, inplace=True)
|
15 |
+
df.drop("p1_sets", axis=1, inplace=True)
|
16 |
+
df.drop("p2_sets", axis=1, inplace=True)
|
17 |
+
df.drop("p1_games", axis=1, inplace=True)
|
18 |
+
df.drop("p2_games", axis=1, inplace=True)
|
19 |
+
df.drop("p1_points_won", axis=1, inplace=True)
|
20 |
+
df.drop("p2_points_won", axis=1, inplace=True)
|
21 |
+
df.drop("p1_distance_run", axis=1, inplace=True)
|
22 |
+
df.drop("p2_distance_run", axis=1, inplace=True)
|
23 |
+
df.drop("speed_mph", axis=1, inplace=True)
|
24 |
+
df.drop("p1_score_normal", axis=1, inplace=True)
|
25 |
+
df.drop("p2_score_normal", axis=1, inplace=True)
|
26 |
+
df.drop("p1_score_tiebreak", axis=1, inplace=True)
|
27 |
+
df.drop("p2_score_tiebreak", axis=1, inplace=True)
|
28 |
+
df.drop("p1_game_victor", axis=1, inplace=True)
|
29 |
+
df.drop("p2_game_victor", axis=1, inplace=True)
|
30 |
+
df.drop("p1_set_victor", axis=1, inplace=True)
|
31 |
+
df.drop("p2_set_victor", axis=1, inplace=True)
|
32 |
+
|
33 |
+
plt.figure(figsize=(10, 6))
|
34 |
+
|
35 |
+
pd.plotting.parallel_coordinates(df, "point_victor", colormap="viridis")
|
36 |
+
|
37 |
+
title = "Parallel Coordinates Plot"
|
38 |
+
plt.title(title)
|
39 |
+
|
40 |
+
plt.xlabel("Attributes")
|
41 |
+
plt.ylabel("Values")
|
42 |
+
plt.legend()
|
43 |
+
|
44 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
45 |
+
|
46 |
+
plt.show()
|
visualization/draw_play_flow.py
ADDED
@@ -0,0 +1,87 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from sklearn.metrics import *
|
4 |
+
from sklearn.preprocessing import label_binarize
|
5 |
+
|
6 |
+
from static.config import Config
|
7 |
+
|
8 |
+
|
9 |
+
def draw_play_flow(df, p1_name, p2_name, p1_ace, p2_ace, p1_net_pt_won, p2_net_pt_won, p1_break_pt_won, p2_break_pt_won):
|
10 |
+
plt.figure(figsize=(10, 6))
|
11 |
+
|
12 |
+
plt.plot(
|
13 |
+
df.loc[:, "elapsed_time"].values,
|
14 |
+
df.loc[:, "p1_points_won"].values,
|
15 |
+
"-",
|
16 |
+
color=Config.COLORS_1[6],
|
17 |
+
alpha=0.5,
|
18 |
+
label=p1_name
|
19 |
+
)
|
20 |
+
plt.plot(
|
21 |
+
df.loc[:, "elapsed_time"].values,
|
22 |
+
df.loc[:, "p2_points_won"].values,
|
23 |
+
"-",
|
24 |
+
color=Config.COLORS_1[7],
|
25 |
+
alpha=0.5,
|
26 |
+
label=p2_name
|
27 |
+
)
|
28 |
+
|
29 |
+
plt.scatter(
|
30 |
+
p1_ace.loc[:, "elapsed_time"].values,
|
31 |
+
p1_ace.loc[:, "p1_points_won"].values,
|
32 |
+
s=40,
|
33 |
+
c=Config.COLORS_1[0],
|
34 |
+
marker="v",
|
35 |
+
label="p1_ace"
|
36 |
+
)
|
37 |
+
plt.scatter(
|
38 |
+
p2_ace.loc[:, "elapsed_time"].values,
|
39 |
+
p2_ace.loc[:, "p2_points_won"].values,
|
40 |
+
s=40,
|
41 |
+
c=Config.COLORS_1[1],
|
42 |
+
marker="v",
|
43 |
+
label="p2_ace"
|
44 |
+
)
|
45 |
+
plt.scatter(
|
46 |
+
p1_net_pt_won.loc[:, "elapsed_time"].values,
|
47 |
+
p1_net_pt_won.loc[:, "p1_points_won"].values,
|
48 |
+
s=40,
|
49 |
+
c=Config.COLORS_1[2],
|
50 |
+
marker="*",
|
51 |
+
label="p1_net_pt_won"
|
52 |
+
)
|
53 |
+
plt.scatter(
|
54 |
+
p2_net_pt_won.loc[:, "elapsed_time"].values,
|
55 |
+
p2_net_pt_won.loc[:, "p2_points_won"].values,
|
56 |
+
s=40,
|
57 |
+
c=Config.COLORS_1[3],
|
58 |
+
marker="*",
|
59 |
+
label="p2_net_pt_won"
|
60 |
+
)
|
61 |
+
plt.scatter(
|
62 |
+
p1_break_pt_won.loc[:, "elapsed_time"].values,
|
63 |
+
p1_break_pt_won.loc[:, "p1_points_won"].values,
|
64 |
+
s=40,
|
65 |
+
c=Config.COLORS_1[4],
|
66 |
+
marker="+",
|
67 |
+
label="p1_break_pt_won"
|
68 |
+
)
|
69 |
+
plt.scatter(
|
70 |
+
p2_break_pt_won.loc[:, "elapsed_time"].values,
|
71 |
+
p2_break_pt_won.loc[:, "p2_points_won"].values,
|
72 |
+
s=40,
|
73 |
+
c=Config.COLORS_1[5],
|
74 |
+
marker="+",
|
75 |
+
label="p1_break_pt_won"
|
76 |
+
)
|
77 |
+
|
78 |
+
title = "Flow of play"
|
79 |
+
# plt.title(title)
|
80 |
+
|
81 |
+
plt.xlabel("Elapsed time")
|
82 |
+
plt.ylabel("Points")
|
83 |
+
plt.legend()
|
84 |
+
|
85 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
86 |
+
|
87 |
+
plt.show()
|
visualization/draw_pred_total.py
ADDED
@@ -0,0 +1,42 @@
1 |
+
import numpy as np
|
2 |
+
from matplotlib import pyplot as plt
|
3 |
+
|
4 |
+
from static.config import Config
|
5 |
+
|
6 |
+
|
7 |
+
def draw_pred_total(input_dict):
|
8 |
+
plt.figure(figsize=(10, 6))
|
9 |
+
|
10 |
+
i = 0
|
11 |
+
for name, cur_list in input_dict.items():
|
12 |
+
mylist = cur_list
|
13 |
+
plt.plot(
|
14 |
+
np.array([x for x in range(len(cur_list[0]))]),
|
15 |
+
cur_list[0],
|
16 |
+
"-",
|
17 |
+
color=Config.COLORS_4[i],
|
18 |
+
alpha=0.9,
|
19 |
+
label=name
|
20 |
+
)
|
21 |
+
i += 1
|
22 |
+
|
23 |
+
plt.plot(
|
24 |
+
np.array([x for x in range(len(mylist[1]))]),
|
25 |
+
mylist[1],
|
26 |
+
"--",
|
27 |
+
color=Config.COLORS_4[1],
|
28 |
+
alpha=0.9,
|
29 |
+
label="actual data"
|
30 |
+
)
|
31 |
+
|
32 |
+
title = "pred curve"
|
33 |
+
|
34 |
+
plt.xlabel("Sizes")
|
35 |
+
plt.ylabel("Value")
|
36 |
+
plt.legend()
|
37 |
+
|
38 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
39 |
+
|
40 |
+
plt.show()
|
41 |
+
|
42 |
+
|
visualization/draw_roc_auc_curve_total.py
ADDED
@@ -0,0 +1,58 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from sklearn.metrics import *
|
4 |
+
from sklearn.preprocessing import label_binarize
|
5 |
+
|
6 |
+
from static.config import Config
|
7 |
+
|
8 |
+
|
9 |
+
def draw_roc_auc_curve_total(input_dict, type):
|
10 |
+
plt.figure(figsize=(10, 6))
|
11 |
+
|
12 |
+
if type == "train":
|
13 |
+
i = 0
|
14 |
+
for label_name, values in input_dict.items():
|
15 |
+
fpr = values[0]
|
16 |
+
tpr = values[1]
|
17 |
+
thresholds = values[2]
|
18 |
+
|
19 |
+
plt.plot(
|
20 |
+
fpr,
|
21 |
+
tpr,
|
22 |
+
"o-",
|
23 |
+
color=Config.COLORS[i],
|
24 |
+
label=label_name+str(round(auc(fpr, tpr), 2))
|
25 |
+
)
|
26 |
+
|
27 |
+
i += 1
|
28 |
+
|
29 |
+
title = "Training roc-auc curve"
|
30 |
+
plt.title(title)
|
31 |
+
|
32 |
+
else:
|
33 |
+
i = 0
|
34 |
+
for label_name, values in input_dict.items():
|
35 |
+
fpr = values[0]
|
36 |
+
tpr = values[1]
|
37 |
+
thresholds = values[2]
|
38 |
+
|
39 |
+
plt.plot(
|
40 |
+
fpr,
|
41 |
+
tpr,
|
42 |
+
"o-",
|
43 |
+
color=Config.COLORS[i],
|
44 |
+
label=label_name + str(round(auc(fpr, tpr), 2))
|
45 |
+
)
|
46 |
+
|
47 |
+
i += 1
|
48 |
+
|
49 |
+
title = "Cross-validation roc-auc curve"
|
50 |
+
plt.title(title)
|
51 |
+
|
52 |
+
plt.xlabel("fpr")
|
53 |
+
plt.ylabel("tpr")
|
54 |
+
plt.legend()
|
55 |
+
|
56 |
+
plt.savefig("./diagram/{}.png".format(title), dpi=300)
|
57 |
+
|
58 |
+
plt.show()
|
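Each entry of input_dict is expected to hold the (fpr, tpr, thresholds) triple returned by sklearn's roc_curve, keyed by a model or label name. A hedged usage sketch with illustrative data (labels, scores, and the import path are assumptions, not from the commit):

import numpy as np
from sklearn.metrics import roc_curve
# assuming the module is importable as visualization.draw_roc_auc_curve_total
from visualization.draw_roc_auc_curve_total import draw_roc_auc_curve_total

# Illustrative binary labels and predicted positive-class scores for one model
y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.10, 0.40, 0.35, 0.80, 0.20, 0.70])

fpr, tpr, thresholds = roc_curve(y_true, y_score)
draw_roc_auc_curve_total({"logistic regression ": (fpr, tpr, thresholds)}, "train")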
visualization/draw_scatter.py  ADDED  @@ -0,0 +1,70 @@
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from coding.llh.static.config import Config


# Draw scatter
def draw_scatter_2D(x_data, y_data, centers, title):
    num_clusters = np.unique(y_data)

    plt.figure(figsize=(10, 8))

    for i in range(len(num_clusters)):
        plt.scatter(x_data[y_data == i][:, 0], x_data[y_data == i][:, 1], s=1)
    for i in range(len(num_clusters)):
        plt.scatter(centers[i, 0], centers[i, 1], marker="*", s=50, c="black")

    plt.title(title)

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()


def draw_scatter_2D_1(x_data, title):
    plt.figure(figsize=(10, 8))

    plt.scatter(x_data[:, 0], x_data[:, 1], s=1)

    plt.title(title)

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()


def draw_scatter_3D(x_data, y_data, centers, title):
    num_clusters = np.unique(y_data)

    fig = plt.figure(figsize=(10, 8))

    ax = Axes3D(fig)
    fig.add_axes(ax)

    for i in range(len(num_clusters)):
        ax.scatter(x_data[y_data == i][:, 0], x_data[y_data == i][:, 1], x_data[y_data == i][:, 2], s=1)
    for i in range(len(num_clusters)):
        ax.scatter(centers[i, 0], centers[i, 1], centers[i, 2], marker="*", s=50, c="black")

    plt.title(title)

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()


def draw_scatter_3D_1(x_data, title):
    fig = plt.figure(figsize=(10, 8))

    ax = Axes3D(fig)
    fig.add_axes(ax)

    ax.scatter(x_data[:, 0], x_data[:, 1], x_data[:, 2], s=1)

    plt.title(title)

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()
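The labelled variants assume integer cluster labels 0..k-1 in y_data and a matching centers array, e.g. the output of a fitted KMeans model. An illustrative sketch with synthetic data (nothing below comes from the commit; the import path is assumed):

import numpy as np
from sklearn.cluster import KMeans
# assuming the module is importable as visualization.draw_scatter
from visualization.draw_scatter import draw_scatter_2D

# Synthetic 2-D points clustered into 3 groups
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

draw_scatter_2D(X, kmeans.labels_, kmeans.cluster_centers_, "kmeans clusters")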
visualization/draw_scatter_line_graph.py  ADDED  @@ -0,0 +1,27 @@
import numpy as np
import matplotlib.pyplot as plt

from coding.llh.static.config import Config


# draw scatter line graph
def draw_scatter_line_graph(x_data, y_pred_data, y_real_data, coef, intercept, labels, title):
    # Manually adjust based on the data
    layout = """
    ABCDE
    FGHIJ
    """

    fig, ax = plt.subplot_mosaic(layout, figsize=(16, 16))

    for i in range(np.size(x_data, 1)):
        ax[str(chr(i + 65))].scatter(x_data[:, i], y_pred_data.T, color=Config.COLORS[0], s=4, label=labels[0])
        ax[str(chr(i + 65))].scatter(x_data[:, i], y_real_data, color=Config.COLORS[1], s=4, label=labels[1])
        ax[str(chr(i + 65))].plot(x_data[:, i], x_data[:, i] * coef[i] + intercept, color=Config.COLORS[2], markersize=4)
        ax[str(chr(i + 65))].legend()

    plt.suptitle(title)

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()
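The hard-coded ABCDE/FGHIJ mosaic gives ten panels, so x_data is expected to have up to ten feature columns (the in-code comment says to adjust the layout by hand otherwise), with coef and intercept taken from a fitted linear model. A hypothetical call for illustration (data, names, and the import path are assumptions):

import numpy as np
from sklearn.linear_model import LinearRegression
# assuming the module is importable as visualization.draw_scatter_line_graph
from visualization.draw_scatter_line_graph import draw_scatter_line_graph

# Illustrative data: 100 samples, 10 features (one per mosaic panel)
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 10))
y = X @ rng.normal(size=10) + rng.normal(scale=0.1, size=100)

model = LinearRegression().fit(X, y)
draw_scatter_line_graph(X, model.predict(X), y, model.coef_, model.intercept_,
                        ["predicted", "actual"], "feature-wise fit")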
visualization/draw_swings_and_positives.py  ADDED  @@ -0,0 +1,46 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import *
from sklearn.preprocessing import label_binarize

from coding.llh.static.config import Config


def draw_swings_and_positives(df, p1_name, p2_name):
    plt.figure(figsize=(10, 6))

    plt.plot(
        df.loc[:, "elapsed_time"].values,
        df.loc[:, "swing"].values,
        "-",
        color=Config.COLORS_2[2],
        alpha=0.7,
        label="Swing of Play"
    )
    plt.plot(
        df.loc[:, "elapsed_time"].values,
        df.loc[:, "p1_remain_positive"].values,
        "-.",
        color=Config.COLORS_2[0],
        alpha=0.7,
        label=p1_name
    )
    plt.plot(
        df.loc[:, "elapsed_time"].values,
        df.loc[:, "p2_remain_positive"].values,
        "-.",
        color=Config.COLORS_2[1],
        alpha=0.7,
        label=p2_name
    )

    title = "Standard time interval"
    # plt.title(title)

    plt.xlabel("Elapsed time")
    plt.ylabel("Standard time interval")
    plt.legend()

    plt.savefig("./diagram/{}.png".format(title), dpi=300)

    plt.show()
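The function only reads four columns from df: elapsed_time, swing, p1_remain_positive, and p2_remain_positive. A minimal illustrative call (the frame, player names, and import path below are fabricated for demonstration, not part of the commit):

import pandas as pd
# assuming the module is importable as visualization.draw_swings_and_positives
from visualization.draw_swings_and_positives import draw_swings_and_positives

# Illustrative frame with the four columns the function expects
df = pd.DataFrame({
    "elapsed_time": [0, 1, 2, 3, 4],
    "swing": [0.0, 0.3, -0.2, 0.5, 0.1],
    "p1_remain_positive": [0, 1, 1, 2, 2],
    "p2_remain_positive": [0, 0, 1, 1, 2],
})
draw_swings_and_positives(df, "Player 1", "Player 2")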