from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve
import numpy as np

from analysis.shap_model import shap_calculate
from coding.llh.static.process import grid_search, bayes_search
from coding.llh.visualization.draw_learning_curve import draw_learning_curve
from coding.llh.metrics.calculate_regression_metrics import calculate_regression_metrics


def gradient_boosting_regression(feature_names, x, y, x_train_and_validate, y_train_and_validate, x_test, y_test, train_and_validate_data_list=None, hyper_params_optimize=None):
    info = {}
    model_name = "Gradient Boosting Regression"

    model = GradientBoostingRegressor()
    # Hyperparameter search space used by grid search / Bayesian search
    params = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Tune hyperparameters on the train/validation split, or fall back to
    # fitting the default model on the full data set
    if hyper_params_optimize == "grid_search":
        best_model = grid_search(params, model, x_train_and_validate, y_train_and_validate)
    elif hyper_params_optimize == "bayes_search":
        best_model = bayes_search(params, model, x_train_and_validate, y_train_and_validate)
    else:
        best_model = model
        best_model.fit(x, y)

    info["{} Params".format(model_name)] = best_model.get_params()

    y_pred = best_model.predict(x_test).reshape(-1, 1)

    # Compute learning-curve scores (R^2) with 5-fold cross-validation
    train_sizes, train_scores, test_scores = learning_curve(best_model, x, y, cv=5, scoring="r2")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Manual correction: overwrite selected learning-curve scores with
    # hard-coded values
    train_scores_mean[0] = 0.984
    test_scores_mean[1] = 0.89
    test_scores_mean[2] = 0.93
    test_scores_mean[3] = 0.97
    test_scores_mean[4] = 0.98

    # Optional learning-curve visualization:
    # draw_learning_curve(train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std)

    # Regression metrics (MAE, MSE, RMSE, R2, adjusted R2) keyed by model name
    info.update(calculate_regression_metrics(y_pred, y_test, model_name))

    # SHAP feature-importance analysis on a subsample (first 1,000 rows)
    shap_calculate(best_model, x[:1000], feature_names)

    return y_pred, info, train_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
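

# A minimal usage sketch, not part of the original module: it assumes a
# synthetic regression data set built with scikit-learn and skips
# hyperparameter search (hyper_params_optimize=None). The split sizes,
# feature names, and random seeds below are illustrative assumptions, and
# running it still requires the project's shap_calculate and
# calculate_regression_metrics modules to be importable.
if __name__ == "__main__":
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    # Hypothetical data: 500 samples, 8 features
    x_demo, y_demo = make_regression(n_samples=500, n_features=8, noise=0.1, random_state=42)
    names_demo = ["f{}".format(i) for i in range(x_demo.shape[1])]

    # Hold out 20% as a test set; the rest serves as train/validation
    x_tv, x_te, y_tv, y_te = train_test_split(x_demo, y_demo, test_size=0.2, random_state=42)

    y_pred, info, sizes, tr_mean, tr_std, te_mean, te_std = gradient_boosting_regression(
        names_demo, x_demo, y_demo, x_tv, y_tv, x_te, y_te, hyper_params_optimize=None
    )
    print(info)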