import os

# Pin OpenMP threads before importing any OpenMP-backed numerical libraries.
os.environ["OMP_PROC_BIND"] = "spread"
os.environ["OMP_PLACES"] = "threads"

import yaml
import joblib

from datasets import load_dataset

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid
from plaid.pipelines.sklearn_block_wrappers import (
    WrappedPlaidSklearnTransformer,
    WrappedPlaidSklearnRegressor,
)
from plaid.pipelines.plaid_blocks import (
    PlaidTransformedTargetRegressor,
    PlaidColumnTransformer,
)
from mmgp.pipelines.mmgp_blocks import MMGPPreparer, MMGPTransformer

from mmgp_tensile2d.utils import length_scale_init, morphing
|
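# Cap the worker count at 24; os.cpu_count() can return None, hence the fallback.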
n_processes = min(max(1, os.cpu_count() or 1), 24)
|
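# Download the Tensile2d dataset from the Hugging Face hub and convert the
# 500-sample training split into a PLAID dataset.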
hf_dataset = load_dataset("PLAID-datasets/Tensile2d", split="all_samples")
ids_train = hf_dataset.description["split"]["train_500"]

dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset, ids=ids_train, processes_number=n_processes, verbose=True
)
|
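# Read the per-block configuration (feature identifiers for each pipeline block)
# and restrict the dataset to the features the pipeline actually uses.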
with open("config_pipeline.yml", "r") as f:
    config = yaml.safe_load(f)

all_feature_id = (
    config["input_scalar_scaler"]["in_features_identifiers"]
    + config["pca_nodes"]["in_features_identifiers"]
    + config["pca_u1"]["in_features_identifiers"]
)

dataset_train = dataset_train.from_features_identifier(all_feature_id)
|
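# Preprocessing: morph every sample onto a common mesh, min-max scale the
# input scalars, and compress the nodal coordinates with PCA.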
preparator = MMGPPreparer(common_mesh_id=1, morphing=morphing)

input_scalar_scaler = WrappedPlaidSklearnTransformer(
    MinMaxScaler(), **config["input_scalar_scaler"]
)

# Express the nodal coordinates on the common mesh, then reduce them with PCA.
nodes_preprocessor = Pipeline(
    steps=[
        ("mmgp_nodes_transf", MMGPTransformer(**config["mmgp_nodes_transf"])),
        ("pca_nodes", WrappedPlaidSklearnTransformer(PCA(), **config["pca_nodes"])),
    ]
)

# Apply each transformer to its own group of features.
column_preprocessor = PlaidColumnTransformer(
    [
        ("input_scalar_scaler", input_scalar_scaler),
        ("nodes_preprocessor", nodes_preprocessor),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("preparator", preparator),
        ("column_preprocessor", column_preprocessor),
    ]
)
|
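# Regression: a Matern(nu=2.5) Gaussian process on the PCA coefficients, with
# kernel length scales initialized from the training data at fit time.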
kernel = Matern(length_scale_bounds=(1e-8, 1e8), nu=2.5)

gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=2,
    random_state=42,
)

# One independent Gaussian process per output dimension.
reg = MultiOutputRegressor(gpr)

dynamics_params_factory = {"estimator__kernel__length_scale": length_scale_init}

regressor = WrappedPlaidSklearnRegressor(
    reg, **config["regressor_mach"], dynamics_params_factory=dynamics_params_factory
)
|
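# Target transformation: map the u1 field onto the common mesh and reduce it
# with PCA, so the regressor learns PCA coefficients; predictions are mapped
# back to the original space by PlaidTransformedTargetRegressor.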
postprocessor = Pipeline(
    steps=[
        ("mmgp_u1_transf", MMGPTransformer(**config["mmgp_u1_transf"])),
        ("pca_u1", WrappedPlaidSklearnTransformer(PCA(), **config["pca_u1"])),
    ]
)

target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
)
|
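# Full pipeline: preprocessing followed by regression on the transformed targets.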
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)
|
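# Fix the number of retained PCA modes for the input (nodes) and output (u1)
# reductions on a fresh clone of the pipeline.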
optimized_pipeline = clone(pipeline).set_params(
    preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components=16,
    regressor__transformer__pca_u1__sklearn_block__n_components=32,
)
|
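# Train the full pipeline on the training samples.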
optimized_pipeline.fit(dataset_train)
|
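# Persist the fitted pipeline to disk.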
joblib.dump(optimized_pipeline, "pipeline.joblib")
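
# A minimal reuse sketch (not part of the original script): reload the fitted
# pipeline and predict on held-out samples. The "test" split key is an
# assumption; adapt it to the split names in your dataset description.
# loaded_pipeline = joblib.load("pipeline.joblib")
# ids_test = hf_dataset.description["split"]["test"]
# dataset_test, _ = huggingface_dataset_to_plaid(
#     hf_dataset, ids=ids_test, processes_number=n_processes, verbose=True
# )
# dataset_pred = loaded_pipeline.predict(dataset_test)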