import os

# Pin OpenMP thread placement before importing numerical libraries
os.environ["OMP_PROC_BIND"] = "spread"
os.environ["OMP_PLACES"] = "threads"

import yaml
import joblib

from datasets.utils.logging import disable_progress_bar
from datasets import load_dataset

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold, GridSearchCV

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid
from plaid.pipelines.sklearn_block_wrappers import (
    WrappedPlaidSklearnTransformer,
    WrappedPlaidSklearnRegressor,
)
from plaid.pipelines.plaid_blocks import (
    PlaidTransformedTargetRegressor,
    PlaidColumnTransformer,
)
from mmgp.pipelines.mmgp_blocks import MMGPPreparer, MMGPTransformer
from mmgp_tensile2d.utils import length_scale_init, morphing

disable_progress_bar()  # silence HF datasets progress bars

n_processes = min(max(1, os.cpu_count()), 24)

# Load dataset and convert the training split to a plaid dataset
hf_dataset = load_dataset("PLAID-datasets/Tensile2d", split="all_samples")
ids_train = hf_dataset.description["split"]["train_500"]
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset, ids=ids_train, processes_number=n_processes, verbose=True
)

# Load pipeline configuration and trim dataset to the features used below
with open("config_pipeline.yml", "r") as f:
    config = yaml.safe_load(f)

all_feature_id = (
    config["input_scalar_scaler"]["in_features_identifiers"]
    + config["pca_nodes"]["in_features_identifiers"]
    + config["pca_u1"]["in_features_identifiers"]
)
dataset_train = dataset_train.from_features_identifier(all_feature_id)

# Define learning pipeline

# Morph all sample meshes onto a common mesh
preparator = MMGPPreparer(common_mesh_id=1, morphing=morphing)

# Scale input scalars to [0, 1]
input_scalar_scaler = WrappedPlaidSklearnTransformer(
    MinMaxScaler(), **config["input_scalar_scaler"]
)

# Reduce node coordinates with PCA after the MMGP transform
nodes_preprocessor = Pipeline(
    steps=[
        ("mmgp_nodes_transf", MMGPTransformer(**config["mmgp_nodes_transf"])),
        ("pca_nodes", WrappedPlaidSklearnTransformer(PCA(), **config["pca_nodes"])),
    ]
)

column_preprocessor = PlaidColumnTransformer(
    [
        ("input_scalar_scaler", input_scalar_scaler),
        ("nodes_preprocessor", nodes_preprocessor),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("preparator", preparator),
        ("column_preprocessor", column_preprocessor),
    ]
)

# Multi-output Gaussian process regressor in the reduced space
kernel = Matern(length_scale_bounds=(1e-8, 1e8), nu=2.5)
gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=2,
    random_state=42,
)
reg = MultiOutputRegressor(gpr)

dynamics_params_factory = {"estimator__kernel__length_scale": length_scale_init}
regressor = WrappedPlaidSklearnRegressor(
    reg, **config["regressor_mach"], dynamics_params_factory=dynamics_params_factory
)

# Reduce the output field u1 with PCA; the target regressor inverts this
# transform at prediction time
postprocessor = Pipeline(
    steps=[
        ("mmgp_u1_transf", MMGPTransformer(**config["mmgp_u1_transf"])),
        ("pca_u1", WrappedPlaidSklearnTransformer(PCA(), **config["pca_u1"])),
    ]
)

target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)

# Set hyperparameters that were optimized by cross-validation on the training set
optimized_pipeline = clone(pipeline).set_params(
    preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components=16,
    regressor__transformer__pca_u1__sklearn_block__n_components=32,
)

# Train the model
optimized_pipeline.fit(dataset_train)

# Save model
joblib.dump(optimized_pipeline, "pipeline.joblib")
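
# ---------------------------------------------------------------------------
# For reference, the two n_components values set above can be selected by a
# grid search, which is presumably why KFold and GridSearchCV are imported.
# The snippet below is a hedged sketch, not the original tuning run: it
# assumes the plaid pipeline is scikit-learn-compatible enough for
# GridSearchCV to consume dataset_train directly (including a usable default
# score); the candidate grids are illustrative. Adapt before uncommenting.
# ---------------------------------------------------------------------------
# param_grid = {
#     "preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes"
#     "__sklearn_block__n_components": [8, 16, 32],
#     "regressor__transformer__pca_u1__sklearn_block__n_components": [16, 32, 64],
# }
# search = GridSearchCV(
#     clone(pipeline), param_grid, cv=KFold(n_splits=4, shuffle=True, random_state=42)
# )
# search.fit(dataset_train)
# print(search.best_params_)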
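
# ---------------------------------------------------------------------------
# Hedged usage sketch: reload the saved pipeline and run inference on the
# test split. The split name "test" and the exact return value of predict()
# on a plaid dataset are assumptions (fit() above consumes one); check the
# Tensile2d description for the available split names before use.
# ---------------------------------------------------------------------------
# ids_test = hf_dataset.description["split"]["test"]
# dataset_test, _ = huggingface_dataset_to_plaid(
#     hf_dataset, ids=ids_test, processes_number=n_processes, verbose=True
# )
# loaded_pipeline = joblib.load("pipeline.joblib")
# dataset_pred = loaded_pipeline.predict(dataset_test)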