import os

import joblib
import yaml
from datasets import load_dataset
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid
from plaid.pipelines.plaid_blocks import PlaidColumnTransformer, PlaidTransformedTargetRegressor
from plaid.pipelines.sklearn_block_wrappers import (
    WrappedPlaidSklearnRegressor,
    WrappedPlaidSklearnTransformer,
)

from pca_gp_vkils59.utils import length_scale_init

n_processes = min(max(1, os.cpu_count()), 24)

# Load the dataset and convert the training split to a plaid dataset
hf_dataset = load_dataset("PLAID-datasets/VKI-LS59", split="all_samples")
ids_train = hf_dataset.description["split"]["train"]
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset, ids=ids_train, processes_number=n_processes, verbose=True
)

# Load the pipeline configuration and trim the dataset to the features used below
with open("config_pipeline.yml", "r") as f:
    config = yaml.safe_load(f)

all_feature_id = (
    config["input_scalar_scaler"]["in_features_identifiers"]
    + config["pca_nodes"]["in_features_identifiers"]
    + config["pca_mach"]["in_features_identifiers"]
)
dataset_train = dataset_train.from_features_identifier(all_feature_id)

# Define the learning pipeline:
# scale the input scalars and reduce the nodal fields with PCA
preprocessor = PlaidColumnTransformer(
    [
        ("input_scalar_scaler", WrappedPlaidSklearnTransformer(MinMaxScaler(), **config["input_scalar_scaler"])),
        ("pca_nodes", WrappedPlaidSklearnTransformer(PCA(), **config["pca_nodes"])),
    ]
)

# PCA on the Mach field, applied to the regression target
postprocessor = WrappedPlaidSklearnTransformer(PCA(), **config["pca_mach"])

# Gaussian process regressor with a Constant * Matern + White kernel,
# one independent GP per output component
kernel = ConstantKernel() * Matern(length_scale_bounds=(1e-8, 1e8), nu=2.5) + WhiteKernel(
    noise_level_bounds=(1e-8, 1e8)
)
gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=1,
    random_state=42,
)
reg = MultiOutputRegressor(gpr)

# Data-dependent initialization of the Matern length scale at fit time
# (in kernel = k1 + k2 with k1 = Constant * Matern, k1__k2 targets the Matern)
dynamics_params_factory = {"estimator__kernel__k1__k2__length_scale": length_scale_init}
regressor = WrappedPlaidSklearnRegressor(
    reg, **config["regressor_mach"], dynamics_params_factory=dynamics_params_factory
)

target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)

# Set hyperparameters that were optimized by cross-validation on the training set
optimized_pipeline = clone(pipeline).set_params(
    preprocessor__pca_nodes__sklearn_block__n_components=3,
    regressor__transformer__sklearn_block__n_components=4,
)

# Train the model
optimized_pipeline.fit(dataset_train)

# Save the trained pipeline
joblib.dump(optimized_pipeline, "pipeline.joblib")
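
# --- Optional: reload the saved pipeline for inference ---
# Minimal sketch, kept commented out since it relies on assumptions not shown
# above: that hf_dataset.description["split"] also contains a "test" key, and
# that the fitted pipeline's predict() accepts a plaid dataset (mirroring how
# fit() is called on dataset_train). Adapt the split key to your setup.
#
# loaded_pipeline = joblib.load("pipeline.joblib")
# ids_test = hf_dataset.description["split"]["test"]
# dataset_test, _ = huggingface_dataset_to_plaid(
#     hf_dataset, ids=ids_test, processes_number=n_processes, verbose=True
# )
# dataset_test = dataset_test.from_features_identifier(all_feature_id)
# predictions = loaded_pipeline.predict(dataset_test)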