import os

# Pin OpenMP threads before any numerical library is imported: spread the
# threads across the available places and bind each one to a hardware thread.
os.environ["OMP_PROC_BIND"] = "spread"
os.environ["OMP_PLACES"] = "threads"

from pathlib import Path

import yaml
import numpy as np
import joblib

from datasets.utils.logging import disable_progress_bar
from datasets import load_dataset

# Silence the Hugging Face datasets progress bars.
disable_progress_bar()

from sklearn.base import clone
from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor

from sklearn.model_selection import KFold, GridSearchCV

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from plaid.pipelines.sklearn_block_wrappers import WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor
from plaid.pipelines.plaid_blocks import PlaidTransformedTargetRegressor, PlaidColumnTransformer
from mmgp.pipelines.mmgp_blocks import MMGPPreparer, MMGPTransformer

from mmgp_tensile2d.utils import length_scale_init, morphing


# Use at most 24 worker processes for the Hugging Face -> PLAID conversion.
n_processes = min(max(1, os.cpu_count()), 24)


# load dataset
hf_dataset = load_dataset("PLAID-datasets/Tensile2d", split="all_samples")
ids_train = hf_dataset.description["split"]['train_500']

dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = ids_train, processes_number = n_processes, verbose = True)


# load pipeline configuration and trim dataset
with open("config_pipeline.yml", 'r') as f:
    config = yaml.safe_load(f)

all_feature_id = (
    config['input_scalar_scaler']['in_features_identifiers']
    + config['pca_nodes']['in_features_identifiers']
    + config['pca_u1']['in_features_identifiers']
)

dataset_train = dataset_train.from_features_identifier(all_feature_id)


# define learning pipeline
preparator = MMGPPreparer(common_mesh_id = 1, morphing = morphing)

input_scalar_scaler = WrappedPlaidSklearnTransformer(MinMaxScaler(), **config['input_scalar_scaler'])

nodes_preprocessor = Pipeline(
    steps=[
        ("mmgp_nodes_transf", MMGPTransformer(**config['mmgp_nodes_transf'])),
        ('pca_nodes', WrappedPlaidSklearnTransformer(PCA(), **config['pca_nodes'])),
    ]
)

column_preprocessor = PlaidColumnTransformer(
    [
        ('input_scalar_scaler', input_scalar_scaler),
        ('nodes_preprocessor', nodes_preprocessor),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("preparator", preparator),
        ('column_preprocessor', column_preprocessor),
    ]
)


kernel = Matern(length_scale_bounds=(1e-8, 1e8), nu = 2.5)

gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=2,
    random_state=42)

reg = MultiOutputRegressor(gpr)


# Initialize the Matern length scales at fit time with the length_scale_init helper.
dynamics_params_factory = {'estimator__kernel__length_scale': length_scale_init}

regressor = WrappedPlaidSklearnRegressor(reg, **config['regressor_mach'], dynamics_params_factory = dynamics_params_factory)

postprocessor = Pipeline(
    steps=[
        ("mmgp_u1_transf", MMGPTransformer(**config['mmgp_u1_transf'])),
        ('pca_u1', WrappedPlaidSklearnTransformer(PCA(), **config['pca_u1'])),
    ]
)


target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)



# Set the hyperparameters that were optimized by cross-validation on the
# training set (a commented sketch of such a search follows the block below)
optimized_pipeline = clone(pipeline).set_params(
    preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components = 16,
    regressor__transformer__pca_u1__sklearn_block__n_components = 32
)
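
# A minimal, commented-out sketch of how such a cross-validation search could be
# run with the KFold/GridSearchCV imports above. The grid values and the scoring
# callable are illustrative assumptions, not the setup actually used to obtain
# the values set above.
#
# param_grid = {
#     "preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components": [8, 16, 32],
#     "regressor__transformer__pca_u1__sklearn_block__n_components": [16, 32, 64],
# }
# search = GridSearchCV(
#     clone(pipeline),
#     param_grid,
#     cv=KFold(n_splits=5, shuffle=True, random_state=42),
#     scoring=plaid_scorer,  # hypothetical scoring callable suited to PLAID datasets
# )
# search.fit(dataset_train)
# print(search.best_params_)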

# Train the model
optimized_pipeline.fit(dataset_train)

# Save model
joblib.dump(optimized_pipeline, "pipeline.joblib")
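
# A minimal, commented-out sketch of reusing the saved model for inference.
# The 'test' split key and the direct call to predict() on a PLAID dataset are
# assumptions; check hf_dataset.description["split"] for the exact split names.
#
# pipeline = joblib.load("pipeline.joblib")
# ids_test = hf_dataset.description["split"]["test"]
# dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids=ids_test, processes_number=n_processes)
# dataset_pred = pipeline.predict(dataset_test)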