# mmgp_tensile2d/train.py
import os
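# Control OpenMP thread placement; set before numerical libraries are imported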
os.environ["OMP_PROC_BIND"] = "spread"
os.environ["OMP_PLACES"] = "threads"
import yaml
import joblib
from datasets import load_dataset
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid
from plaid.pipelines.sklearn_block_wrappers import WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor
from plaid.pipelines.plaid_blocks import PlaidTransformedTargetRegressor, PlaidColumnTransformer
from mmgp.pipelines.mmgp_blocks import MMGPPreparer, MMGPTransformer
from mmgp_tensile2d.utils import length_scale_init, morphing
# Use up to 24 worker processes for the dataset conversion
n_processes = min(os.cpu_count() or 1, 24)
# load dataset
hf_dataset = load_dataset("PLAID-datasets/Tensile2d", split="all_samples")
ids_train = hf_dataset.description["split"]["train_500"]
dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids=ids_train, processes_number=n_processes, verbose=True)
# load pipeline configuration and trim dataset
with open("config_pipeline.yml", "r") as f:
    config = yaml.safe_load(f)
all_feature_id = (config["input_scalar_scaler"]["in_features_identifiers"]
                  + config["pca_nodes"]["in_features_identifiers"]
                  + config["pca_u1"]["in_features_identifiers"])
dataset_train = dataset_train.from_features_identifier(all_feature_id)
# define learning pipeline
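# Prepare samples: morph each mesh onto a common reference mesh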
preparator = MMGPPreparer(common_mesh_id=1, morphing=morphing)
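# Scale the input scalars to [0, 1]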
input_scalar_scaler = WrappedPlaidSklearnTransformer(MinMaxScaler(), **config["input_scalar_scaler"])
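# MMGP transform of the mesh node coordinates, followed by PCA compression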
nodes_preprocessor = Pipeline(
    steps=[
        ("mmgp_nodes_transf", MMGPTransformer(**config["mmgp_nodes_transf"])),
        ("pca_nodes", WrappedPlaidSklearnTransformer(PCA(), **config["pca_nodes"])),
    ]
)
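# Apply each preprocessing branch to its own group of features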
column_preprocessor = PlaidColumnTransformer(
    [
        ("input_scalar_scaler", input_scalar_scaler),
        ("nodes_preprocessor", nodes_preprocessor),
    ]
)
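# Input preprocessing: mesh preparation, then the column-wise transforms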
preprocessor = Pipeline(
    steps=[
        ("preparator", preparator),
        ("column_preprocessor", column_preprocessor),
    ]
)
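# Gaussian process regressor with a Matern(nu=2.5) kernel; MultiOutputRegressor fits one GP per output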
kernel = Matern(length_scale_bounds=(1e-8, 1e8), nu=2.5)
gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=2,
    random_state=42,
)
reg = MultiOutputRegressor(gpr)
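# length_scale_init supplies data-dependent initial values for the kernel length scales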
dynamics_params_factory = {"estimator__kernel__length_scale": length_scale_init}
regressor = WrappedPlaidSklearnRegressor(reg, **config["regressor_mach"], dynamics_params_factory=dynamics_params_factory)
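# Target-side transform: MMGP transform of the u1 field, then PCA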
postprocessor = Pipeline(
    steps=[
        ("mmgp_u1_transf", MMGPTransformer(**config["mmgp_u1_transf"])),
        ("pca_u1", WrappedPlaidSklearnTransformer(PCA(), **config["pca_u1"])),
    ]
)
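# Fit the regressor on the transformed targets and invert the transform at prediction time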
target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
)
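# End-to-end model: preprocessing followed by regression on the transformed targets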
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)
# Set hyperparameters that have been optimized by cross-validation on the training set
optimized_pipeline = clone(pipeline).set_params(
    preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components=16,
    regressor__transformer__pca_u1__sklearn_block__n_components=32,
)
# Train the model
optimized_pipeline.fit(dataset_train)
# Save model
joblib.dump(optimized_pipeline, "pipeline.joblib")
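
# Example usage (hypothetical sketch, assuming the fitted plaid pipeline exposes the usual
# scikit-learn predict interface on a plaid dataset such as a held-out test split):
# pipeline = joblib.load("pipeline.joblib")
# dataset_pred = pipeline.predict(dataset_test)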