Source code for hosa.optimization.hosa

"""
Utilities for implementing the HOSA.
"""
import abc

import numpy as np
from sklearn.model_selection import ShuffleSplit
from tqdm import tqdm

from hosa.helpers.functions import create_parameter_grid, create_overlapping, \
    prepare_param_overlapping


class BaseHOSA:
    """
    Heuristic Oriented Search Algorithm (HOSA)

    This class implements the HOSA. Following a heuristic search, the algorithm finetunes the
    most relevant models' parameters. Thus, HOSA avoids testing every possible combination, and
    therefore, an exhaustive search.

    .. warning::
        This class should not be used directly. Use derived classes instead, i.e.,
        :class:`.HOSACNN` or :class:`.HOSARNN`.

    Args:
        x (numpy.ndarray): Input data.
        y (numpy.ndarray): Target values (class labels in classification, real numbers in
            regression).
        model (object): Class of the object to be optimized. Available options are:
            :class:`.RNNClassification`, :class:`.RNNRegression`, :class:`.CNNClassification`
            and :class:`.CNNRegression`.
        n_outputs (int): Number of class labels in classification, or the number of numerical
            values to predict in regression.
        parameters (dict): Dictionary with parameters names (str) as keys and lists of parameter
            settings to try as values. The dictionary is shallow-copied, so it is never modified
            by the optimizer.
        tr (float): Minimum threshold of improvement of the performance metric.
        apply_rsv (bool): ``True`` if random sub-sampling validation should be used during the
            optimization procedure.
        validation_size (float): Proportion of the dataset to include in the validation split on
            the random sub-sampling validation. **Ignored if ``apply_rsv = False``**.
        n_splits (int): Number of splits used in the random sub-sampling validation. **Ignored if
            ``apply_rsv = False``**.

    Raises:
        ValueError: If a mandatory parameter is missing from ``parameters`` or if the type of
            ``model`` is neither a regression nor a classification model.
    """

    def __init__(self, x, y, model, n_outputs, parameters, tr, apply_rsv=True,
                 validation_size=.25, n_splits=10):
        self.x = x
        self.y = y
        self.model = model
        self.n_outputs = n_outputs
        # Shallow copy: derived classes remove their mandatory keys from `self.parameters[0]`,
        # which must not alter the dictionary owned by the caller (it may be reused for
        # another HOSA instance).
        self.parameters = [dict(parameters)]
        self.tr = tr
        self.apply_rsv = apply_rsv
        self.validation_size = validation_size
        self.n_splits = n_splits

        # The type of the model is detected from its textual representation
        model_name = str(self.model)
        self.is_cnn = 'CNNRegression' in model_name or 'CNNClassification' in model_name
        self.is_rnn = 'RNNRegression' in model_name or 'RNNClassification' in model_name

        # Check if the parameters are valid
        self.__check_params()

        # Initialize variables
        self.best_model = self.best_metric = self.best_specification = None

        # According to the model, initialize the metrics and compare function:
        # regression metrics are minimized, classification metrics are maximized
        if 'Regression' in model_name:
            self.initial_metric_value = np.inf
            self.compare_function = np.less
            self.stop_check = self.__stop_check_decrease
        elif 'Classification' in model_name:
            self.initial_metric_value = -np.inf
            self.compare_function = np.greater
            self.stop_check = self.__stop_check_increase
        else:
            raise ValueError('The type of the model is invalid.')

    def __stop_check_increase(self, best_metric_current, best_metric_prev):
        """
        Checks if the stopping criterion is met for maximization problems (e.g., classification
        problems).

        Args:
            best_metric_current (float): Current best metric found.
            best_metric_prev (float): Previous best metric found.

        Returns:
            bool: Returns ``True`` if the HOSA procedure should be stopped. ``False`` otherwise.
        """
        return best_metric_current - best_metric_prev <= self.tr

    def __stop_check_decrease(self, best_metric_current, best_metric_prev):
        """
        Checks if the stopping criterion is met for minimization problems (e.g., regression
        problems).

        Args:
            best_metric_current (float): Current best metric found.
            best_metric_prev (float): Previous best metric found.

        Returns:
            bool: Returns ``True`` if the HOSA procedure should be stopped. ``False`` otherwise.
        """
        return best_metric_prev - best_metric_current <= self.tr

    def __fit_assess_model(self, model, x_win, y_win, imbalance_correction=None, **kwargs):
        """
        Fits the model and computes the chosen performance metric of the target values based on
        the inputs.

        Args:
            model (tensorflow.keras.Sequential): Model to be fitted and assessed.
            x_win (numpy.ndarray): Windowed input data.
            y_win (numpy.ndarray): Windowed target values (class labels in classification, real
                numbers in regression).
            imbalance_correction (None or bool): Whether to apply correction to class imbalances.
                **Only used for classification problems. Ignored for regression.**
            **kwargs: Extra arguments explicitly used for regression or classification models,
                including the additional arguments that are used in the TensorFlow's model
                ``fit`` function. See `here
                <https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit>`_.

        Returns:
            float: Returns the value of the chosen performance metric on the fitted model. With
            random sub-sampling validation, this is the mean of the metric over all the splits.
        """
        if not self.apply_rsv:
            # Without validation splits, the model is assessed on the data it was fitted with
            model.fit(x_win, y_win, **kwargs)
            metric, *_ = model.score(x_win, y_win, imbalance_correction=imbalance_correction)
            return metric

        # Random sub-sampling validation: generate the splits
        rs = ShuffleSplit(n_splits=self.n_splits, test_size=self.validation_size)
        metrics = np.empty(self.n_splits)
        for i, (train_index, validation_index) in enumerate(rs.split(x_win)):
            model.fit(x_win[train_index], y_win[train_index], **kwargs)
            # Only the first value returned by `score` is the metric used by the HOSA
            metrics[i], *_ = model.score(x_win[validation_index], y_win[validation_index],
                                         imbalance_correction=imbalance_correction)
        return np.mean(metrics)

    def __check_params(self):
        """
        According to the type of model, checks if the user has specified all the mandatory
        parameters.

        Raises:
            ValueError: If the user did not set some mandatory parameter.
        """
        # Note: `required_parameters` must be defined in each child-class
        for parameter in self.required_parameters:
            if parameter not in self.parameters[0]:
                raise ValueError(
                    'The parameter `' + parameter + '` must be specified in the `parameters` '
                                                    'dictionary.')

    def get_params(self):
        """
        Get parameters for the best model found.

        Returns:
            dict: Parameter names mapped to their values. ``None`` until a model was found.
        """
        return self.best_specification

    def get_model(self):
        """
        Get the best model found.

        Returns:
            tensorflow.keras.Sequential: Returns the best TensorFlow model found.
        """
        return self.best_model.model

    def predict(self, x, **kwargs):
        """
        Predicts the target values using the input data in the best model found.

        Args:
            x (numpy.ndarray): Input data.
            **kwargs: Extra arguments that are used in the TensorFlow's model ``predict``
                function. See `here
                <https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict>`_.

        Returns:
            numpy.ndarray: Returns an array containing the estimates that were obtained on the
            best-fitted model found.
        """
        # The input must be windowed with the same overlapping settings of the best model
        overlapping_type, overlapping_epochs, stride, timesteps = prepare_param_overlapping(
            self.get_params())
        x, _ = create_overlapping(x, None, self.best_model, overlapping_epochs, overlapping_type,
                                  n_stride=stride, n_timesteps=timesteps)
        return self.best_model.predict(x, **kwargs)

    def score(self, x, y, **kwargs):
        """
        Computes the performance metrics on the given input data and target values in the best
        model found.

        Args:
            x (numpy.ndarray): Input data.
            y (numpy.ndarray): Target values (class labels in classification, real numbers in
                regression).
            **kwargs: Only used for classification, in order to set the value of the parameter
                ``imbalance_correction``.

        Returns:
            tuple: Returns a tuple containing the performance metric according to the type of
            model.
        """
        # The data must be windowed with the same overlapping settings of the best model
        overlapping_type, overlapping_epochs, stride, timesteps = prepare_param_overlapping(
            self.get_params())
        x, y = create_overlapping(x, y, self.best_model, overlapping_epochs, overlapping_type,
                                  n_stride=stride, n_timesteps=timesteps)
        return self.best_model.score(x, y, **kwargs)

    @abc.abstractmethod
    def fit(self, **kwargs):
        """
        Optimize the model following the HOSA approach with all sets of parameters.

        Args:
            **kwargs: Extra arguments explicitly used for regression or classification models.
        """
        raise NotImplementedError
class HOSACNN(BaseHOSA):
    """
    Heuristic Oriented Search Algorithm (HOSA) for CNNs.

    This class implements the HOSA for optimizing CNNs. Following a heuristic search,
    the algorithm finetunes the most relevant models' parameters. Thus, HOSA avoids testing every
    possible combination, and therefore, an exhaustive search.

    Args:
        x (numpy.ndarray): Input data.
        y (numpy.ndarray): Target values (class labels in classification, real numbers in
            regression).
        model (object): Class of the object to be optimized. Available options are:
            :class:`.RNNClassification`, :class:`.RNNRegression`, :class:`.CNNClassification`
            and :class:`.CNNRegression`.
        n_outputs (int): Number of class labels in classification, or the number of numerical
            values to predict in regression.
        parameters (dict): Dictionary with parameters names (str) as keys and lists of parameter
            settings to try as values. The keys ``n_kernels_first_gol`` and ``mults`` are
            mandatory.
        tr (float): Minimum threshold of improvement of the performance metric.
        apply_rsv (bool): ``True`` if random sub-sampling validation should be used during the
            optimization procedure.
        validation_size (float): Proportion of the dataset to include in the validation split on
            the random sub-sampling validation. **Ignored if ``apply_rsv = False``**.
        n_splits (int): Number of splits used in the random sub-sampling validation. **Ignored if
            ``apply_rsv = False``**.

    Examples:
        .. code-block:: python
            :linenos:

            import numpy as np
            from sklearn.model_selection import train_test_split

            from hosa.models.cnn import CNNRegression
            from hosa.optimization.hosa import HOSACNN

            # 1 - Load the dataset
            dataset = np.loadtxt('...', delimiter=',')
            x = dataset[:, :-1]
            y = dataset[:, -1]
            # 2 - Split the data in train and test dataset
            x_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.3,
                                                                shuffle=False)
            # 3 - Set the parameters to optimize
            param_grid_rnn = {
                    'overlapping_type':          ['central', 'left'],
                    'overlapping_epochs':        [1],
                    'n_kernels_first_gol':       [16, 32],
                    'activation_function_dense': ['relu'],
                    'mults':                     [1, 2],
                    'optimizer':                 ['adam'],
                    'batch_size':                [32],
            }
            # 4 - Create a HOSA instance and find the best model
            regr = HOSACNN(x_train, y_train, CNNRegression, 1, param_grid_rnn, 0.01,
                           apply_rsv=False)
            regr.fit(max_gol_sizes=4, show_progress=True, verbose=1, shuffle=False)
            score = regr.score(X_test, y_test)
    """

    def __init__(self, x, y, model, n_outputs, parameters, tr, apply_rsv=True,
                 validation_size=.25, n_splits=10):
        # Must be set before the base initializer runs, since it validates these keys
        self.required_parameters = ['n_kernels_first_gol', 'mults']
        super().__init__(x, y, model, n_outputs, parameters, tr, apply_rsv, validation_size,
                         n_splits)
        grid = self.parameters[0]
        self.n_kernels_first_gol = grid['n_kernels_first_gol']
        self.mults = grid['mults']
        # The HOSA-specific keys are handled by `fit`, so they are left out of the regular grid.
        # A new dictionary is built (instead of deleting keys) so the dictionary given by the
        # caller is never modified.
        self.parameters[0] = {name: values for name, values in grid.items()
                              if name not in self.required_parameters}

    def fit(self, max_gol_sizes, show_progress=True, imbalance_correction=None, **kwargs):
        """
        Optimize the model following the HOSA approach with all sets of parameters.

        Args:
            max_gol_sizes (int): Maximum number of GofLayers to add to the model.
            show_progress (bool): `True` to show a progress bar; `False` otherwise.
            imbalance_correction (None or bool): Whether to apply correction to class imbalances.
                **Only used for classification problems. Ignored for regression.**
            **kwargs: Extra arguments explicitly used for regression or classification models,
                including the additional arguments that are used in the TensorFlow's model
                ``fit`` function. See `here
                <https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit>`_.

        Returns:
            tuple: Returns a tuple containing the object of the best model found and respective
            performance metric and optimized parameters. The model and the parameters are
            ``None`` if no candidate could be assessed.
        """
        # Initialize variables
        best_metric = self.initial_metric_value
        best_model = best_specification = None
        k_construction_size = []
        stop = False

        with tqdm(total=max_gol_sizes, disable=not show_progress, colour='green') as pbar_all:
            # Perform optimization: each iteration tries to add one more GofL
            while len(k_construction_size) < max_gol_sizes and not stop:
                # Initialize the best current metric for comparing with the best metric found
                best_metric_current = self.initial_metric_value
                best_model_current = best_specification_current = None

                if not k_construction_size:
                    # First GofL: test each number of kernels given by the user
                    n_kernels_test = np.array(self.n_kernels_first_gol).reshape(
                        (len(self.n_kernels_first_gol), 1))
                else:
                    # Next GofLs: keep the kernels already chosen and scale the last one
                    n_kernels_test = [
                        k_construction_size + [np.floor(k_construction_size[-1] * mult)]
                        for mult in self.mults]

                # Test each kernel size
                for n_kernel in n_kernels_test:
                    # Run grid search
                    specification, model, metric = self.grid_search(n_kernel,
                                                                    imbalance_correction,
                                                                    **kwargs)
                    # Compare with the current metrics, and update the current best values if
                    # necessary
                    if self.compare_function(metric, best_metric_current):
                        best_model_current = model
                        best_metric_current = metric
                        best_specification_current = specification

                if best_model_current is None:
                    # No candidate gave a comparable metric (e.g., empty candidate list or NaN
                    # metrics), so there is nothing to build upon
                    stop = True
                elif self.stop_check(best_metric_current, best_metric):
                    # The improvement is below the threshold. The new model is still kept when
                    # it is strictly better than the previous best one.
                    if self.compare_function(best_metric_current, best_metric):
                        best_model = best_model_current
                        best_metric = best_metric_current
                        best_specification = best_specification_current
                    stop = True
                else:
                    best_model = best_model_current
                    best_metric = best_metric_current
                    best_specification = best_specification_current
                    k_construction_size.append(best_model.n_kernels[-1])

                # Update progress bar
                if best_model is not None:
                    best_specification_complete = best_model.__dict__()
                    pbar_all.set_postfix(
                        n_goflayers=len(best_specification_complete['n_kernels']),
                        no_kernels=best_specification_complete['n_kernels'])
                pbar_all.update(1)

        # Complete the specification with the parameters of the best model
        if best_model is not None and best_specification is not None:
            best_specification.update(best_model.__dict__())

        self.best_model = best_model
        self.best_metric = best_metric
        self.best_specification = best_specification

        return self.best_model, self.best_metric, self.best_specification
class HOSARNN(BaseHOSA):
    """
    Heuristic Oriented Search Algorithm (HOSA) for RNNs.

    This class implements the HOSA for optimizing RNNs. Following a heuristic search,
    the algorithm finetunes the most relevant models' parameters. Thus, HOSA avoids testing every
    possible combination, and therefore, an exhaustive search.

    Args:
        x (numpy.ndarray): Input data.
        y (numpy.ndarray): Target values (class labels in classification, real numbers in
            regression).
        model (object): Class of the object to be optimized. Available options are:
            :class:`.RNNClassification`, :class:`.RNNRegression`, :class:`.CNNClassification`
            and :class:`.CNNRegression`.
        n_outputs (int): Number of class labels in classification, or the number of numerical
            values to predict in regression.
        parameters (dict): Dictionary with parameters names (str) as keys and lists of parameter
            settings to try as values. The keys ``n_units`` and ``mults`` are mandatory.
        tr (float): Minimum threshold of improvement of the performance metric.
        apply_rsv (bool): ``True`` if random sub-sampling validation should be used during the
            optimization procedure.
        validation_size (float): Proportion of the dataset to include in the validation split on
            the random sub-sampling validation. **Ignored if ``apply_rsv = False``**.
        n_splits (int): Number of splits used in the random sub-sampling validation. **Ignored if
            ``apply_rsv = False``**.

    Examples:
        .. code-block:: python
            :linenos:

            import numpy as np
            from sklearn.model_selection import train_test_split

            from hosa.models.rnn import RNNClassification
            from hosa.optimization.hosa import HOSARNN

            # 1 - Load the dataset
            dataset = np.loadtxt('...', delimiter=',')
            x = dataset[:, :-1]
            y = dataset[:, -1]
            # 2 - Split the data in train and test dataset
            x_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.1,
                                                                shuffle=False)
            # 3 - Set the parameters to optimize
            param_grid_rnn = {
                    'overlapping_type':          ['central', 'left'],
                    'model_type':                ['lstm', 'gru'],
                    'overlapping_epochs':        [1],
                    'timesteps':                 [1],
                    'activation_function_dense': ['relu'],
                    'n_units':                   [10, 12],
                    'mults':                     [1, 2],
                    'optimizer':                 ['adam'],
                    'batch_size':                [32],
            }
            # 4 - Create a HOSA instance and find the best model
            clf = HOSARNN(x_train, y_train, RNNClassification, 2, param_grid_rnn, 0.01,
                          validation_size=.05, apply_rsv=False)
            clf.fit(max_n_subs_layers=4, show_progress=True, verbose=0, shuffle=False,
                    imbalance_correction=True)
            score = clf.score(X_test, y_test)
    """

    def __init__(self, x, y, model, n_outputs, parameters, tr, apply_rsv=True,
                 validation_size=.25, n_splits=10):
        # Must be set before the base initializer runs, since it validates these keys
        self.required_parameters = ['n_units', 'mults']
        super().__init__(x, y, model, n_outputs, parameters, tr, apply_rsv, validation_size,
                         n_splits)
        grid = self.parameters[0]
        self.n_units = grid['n_units']
        self.mults = grid['mults']
        # The HOSA-specific keys are handled by `fit`, so they are left out of the regular grid.
        # A new dictionary is built (instead of deleting keys) so the dictionary given by the
        # caller is never modified.
        self.parameters[0] = {name: values for name, values in grid.items()
                              if name not in self.required_parameters}

    def fit(self, max_n_subs_layers, show_progress=True, imbalance_correction=None, **kwargs):
        """
        Optimize the model following the HOSA approach with all sets of parameters.

        Args:
            max_n_subs_layers (int): Maximum number of subsequent layers to add to the model.
            show_progress (bool): `True` to show a progress bar; `False` otherwise.
            imbalance_correction (None or bool): Whether to apply correction to class imbalances.
                **Only used for classification problems. Ignored for regression.**
            **kwargs: Extra arguments explicitly used for regression or classification models,
                including the additional arguments that are used in the TensorFlow's model
                ``fit`` function. See `here
                <https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit>`_.

        Returns:
            tuple: Returns a tuple containing the object of the best model found and respective
            performance metric and optimized parameters. The model and the parameters are
            ``None`` if no candidate could be assessed.
        """
        # Initialize variables
        best_metric = self.initial_metric_value
        best_model = best_specification = None
        n_subs_layers_construction = 1
        stop = False

        with tqdm(total=max_n_subs_layers, disable=not show_progress,
                  colour='green') as pbar_all:
            # Perform optimization
            # NOTE(review): the tested values are 1 .. max_n_subs_layers - 1, so the maximum
            # itself is never tested and the progress bar cannot reach its total - confirm
            # whether `<=` was intended.
            while n_subs_layers_construction < max_n_subs_layers and not stop:
                # Initialize the best current metric for comparing with the best metric found
                best_metric_current = self.initial_metric_value
                best_model_current = best_specification_current = None

                # Test each number of hidden units
                for n_units in self.n_units:
                    # Test each number of units in the dense layer
                    for mult in self.mults:
                        # Run grid search
                        specification, model, metric = self.grid_search(
                            n_neurons_dense_layer=np.floor(n_units * mult),
                            n_units=n_units,
                            n_subs_layers=n_subs_layers_construction,
                            imbalance_correction=imbalance_correction,
                            **kwargs)
                        # Compare with the current metrics, and update the current best values
                        # if necessary
                        if self.compare_function(metric, best_metric_current):
                            best_model_current = model
                            best_metric_current = metric
                            best_specification_current = specification

                if best_model_current is None:
                    # No candidate gave a comparable metric (e.g., empty candidate list or NaN
                    # metrics), so there is nothing to build upon
                    stop = True
                elif self.stop_check(best_metric_current, best_metric):
                    # The improvement is below the threshold. The new model is still kept when
                    # it is strictly better than the previous best one.
                    if self.compare_function(best_metric_current, best_metric):
                        best_model = best_model_current
                        best_metric = best_metric_current
                        best_specification = best_specification_current
                    stop = True
                else:
                    best_model = best_model_current
                    best_metric = best_metric_current
                    best_specification = best_specification_current
                    n_subs_layers_construction = n_subs_layers_construction + 1

                # Update progress bar
                if best_model is not None:
                    best_specification_complete = best_model.__dict__()
                    pbar_all.set_postfix(
                        n_subs_layers=best_specification_complete['n_subs_layers'],
                        n_units=best_specification_complete['n_units'],
                        n_hidden_dense=best_specification_complete['n_neurons_dense_layer'])
                pbar_all.update(1)

        self.best_model = best_model
        self.best_metric = best_metric
        self.best_specification = best_specification

        return self.best_model, self.best_metric, self.best_specification