Source code for pysptools.ml.hyperlgbm

#
#------------------------------------------------------------------------------
# Copyright (c) 2013-2018, Christian Therien
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#------------------------------------------------------------------------------
#
# hyperlgbm.py - This file is part of the PySptools package.
#

import numpy as np
import pickle

from pysptools.skl.base import HyperBaseClassifier
from pysptools.ml.plotting import _plot_feature_importances
from lightgbm import LGBMClassifier


# Hacks needed to run load_lgbm_model()
# Add to LGBMModel class (file sklearn.py):
#    # Patch pysptools
#    def set_n_features_(self, n):
#        self._n_features = n
#    # end patch pysptools
#
# Add to LGBMClassifier class:
#    # Patch pysptools
#    def set_le(self, y):
#        self._le = _LGBMLabelEncoder().fit(y)
#        self._classes = self._le.classes_
#        self._n_classes = len(self._classes)
#
#    def set_n_features_(self, n):
#        super(LGBMClassifier, self).set_n_features_(n)
#    # end patch pysptools


def load_lgbm_model(fname):
    """
    Load a LightGBM model that was saved as a file with the
    HyperLGBMClassifier.save method. The model is split over two files:

    * The first file contains the model saved with the Booster class;
      this file has no extension.
    * The second file contains the parameters used to create the model;
      this file has the extension '.p'.

    Parameters
    ----------
    fname : path
        The file name without extension.

    Returns
    -------
    HyperLGBMClassifier class : a model instance
    """
    from lightgbm import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_features = params['meta']['n_features']
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperLGBMClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model.set_n_features_(n_features)
    model._Booster = Booster(model_file=fname)
    return model
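# Example: reload a saved model and predict on new spectra. A minimal
# sketch; 'my_model' (with its companion file 'my_model.p') and X_new
# are hypothetical, X_new being an array of shape [n_samples, n_features].
#
#    model = load_lgbm_model('my_model')
#    y_pred = model.predict(X_new)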
class HyperLGBMClassifier(LGBMClassifier, HyperBaseClassifier):
    """
    LightGBM classifier for Hyperspectral Imaging. The class implements
    the scikit-learn API and is a pysptools submodule. It adds the save
    and load model functionalities. The following is copied from the
    LGBMModel documentation.

    Construct a gradient boosting model.

    Parameters
    ----------
    boosting_type : string, optional (default="gbdt")
        'gbdt', traditional Gradient Boosting Decision Tree.
        'dart', Dropouts meet Multiple Additive Regression Trees.
        'goss', Gradient-based One-Side Sampling.
        'rf', Random Forest.
    num_leaves : int, optional (default=31)
        Maximum tree leaves for base learners.
    max_depth : int, optional (default=-1)
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float, optional (default=0.1)
        Boosting learning rate.
    n_estimators : int, optional (default=100)
        Number of boosted trees to fit.
    subsample_for_bin : int, optional (default=200000)
        Number of samples for constructing bins.
    objective : string, callable or None, optional (default=None)
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
        Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass'
        for LGBMClassifier, 'lambdarank' for LGBMRanker.
    class_weight : dict, 'balanced' or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        Use this parameter only for multi-class classification task;
        for binary classification task you may use ``is_unbalance`` or
        ``scale_pos_weight`` parameters.
        The 'balanced' mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.
        If None, all classes are supposed to have weight one.
        Note that these weights will be multiplied with ``sample_weight``
        (passed through the fit method) if ``sample_weight`` is specified.
    min_split_gain : float, optional (default=0.)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree.
    min_child_weight : float, optional (default=1e-3)
        Minimum sum of instance weight (hessian) needed in a child (leaf).
    min_child_samples : int, optional (default=20)
        Minimum number of data needed in a child (leaf).
    subsample : float, optional (default=1.)
        Subsample ratio of the training instances.
    subsample_freq : int, optional (default=1)
        Frequency of subsample, <=0 means no enable.
    colsample_bytree : float, optional (default=1.)
        Subsample ratio of columns when constructing each tree.
    reg_alpha : float, optional (default=0.)
        L1 regularization term on weights.
    reg_lambda : float, optional (default=0.)
        L2 regularization term on weights.
    random_state : int or None, optional (default=None)
        Random number seed.
        Will use default seeds in c++ code if set to None.
    n_jobs : int, optional (default=-1)
        Number of parallel threads.
    silent : bool, optional (default=True)
        Whether to print messages while running boosting.

    Attributes
    ----------
    n_features_ : int
        The number of features of fitted model.
    classes_ : array of shape = [n_classes]
        The class label array (only for classification problem).
    n_classes_ : int
        The number of classes (only for classification problem).
    best_score_ : dict or None
        The best score of fitted model.
    best_iteration_ : int or None
        The best iteration of fitted model if ``early_stopping_rounds``
        has been specified.
    objective_ : string or callable
        The concrete objective used while fitting this model.
    booster_ : Booster
        The underlying Booster of this model.
    evals_result_ : dict or None
        The evaluation results if ``early_stopping_rounds`` has been specified.
    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess`` or
    ``objective(y_true, y_pred, group) -> grad, hess``:

        y_true: array-like of shape = [n_samples]
            The target values.
        y_pred: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The predicted values.
        group: array-like
            Group/query data, used for ranking task.
        grad: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The value of the gradient for each sample point.
        hess: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The value of the second derivative for each sample point.

    For multi-class task, y_pred is grouped by class_id first, then by
    row_id. If you want to get the i-th row of y_pred in the j-th class,
    the access way is y_pred[j * num_data + i] and you should group grad
    and hess in this way as well.
    """

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=100,
                 subsample_for_bin=200000, objective=None, class_weight=None,
                 min_split_gain=0., min_child_weight=1e-3,
                 min_child_samples=20, subsample=1., subsample_freq=1,
                 colsample_bytree=1., reg_alpha=0., reg_lambda=0.,
                 random_state=None, n_jobs=-1, silent=True):
        super(HyperLGBMClassifier, self).__init__(boosting_type, num_leaves,
                 max_depth, learning_rate, n_estimators, subsample_for_bin,
                 objective, class_weight, min_split_gain, min_child_weight,
                 min_child_samples, subsample, subsample_freq,
                 colsample_bytree, reg_alpha, reg_lambda, random_state,
                 n_jobs, silent)
        HyperBaseClassifier.__init__(self, 'HyperLGBMClassifier')
    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_class_weight=None, eval_init_score=None,
            eval_metric="logloss", early_stopping_rounds=None,
            verbose=True, feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Build a gradient boosting model from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in regression).
        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Weights of training data.
        init_score : array-like of shape = [n_samples] or None, optional (default=None)
            Init score of training data.
        eval_set : list or None, optional (default=None)
            A list of (X, y) tuple pairs to use as validation sets for early-stopping.
        eval_names : list of strings or None, optional (default=None)
            Names of eval_set.
        eval_sample_weight : list of arrays or None, optional (default=None)
            Weights of eval data.
        eval_class_weight : list or None, optional (default=None)
            Class weights of eval data.
        eval_init_score : list of arrays or None, optional (default=None)
            Init score of eval data.
        eval_metric : string, list of strings, callable or None, optional (default="logloss")
            If string, it should be a built-in evaluation metric to use.
            If callable, it should be a custom evaluation metric, see note for more details.
        early_stopping_rounds : int or None, optional (default=None)
            Activates early stopping. The model will train until the validation
            score stops improving. Validation error needs to decrease at least
            every ``early_stopping_rounds`` round(s) to continue training.
        verbose : bool, optional (default=True)
            If True and an evaluation set is used, writes the evaluation progress.
        feature_name : list of strings or 'auto', optional (default="auto")
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
        categorical_feature : list of strings or int, or 'auto', optional (default="auto")
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify
            ``feature_name`` as well).
            If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
        callbacks : list of callback functions or None, optional (default=None)
            List of callback functions that are applied at each iteration.
            See Callbacks in Python API for more information.

        Returns
        -------
        self : object
            Returns self.

        Note
        ----
        A custom eval function expects a callable with one of the following
        signatures: ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
        or ``func(y_true, y_pred, weight, group)``, returning
        (eval_name, eval_result, is_bigger_better) or a list of
        (eval_name, eval_result, is_bigger_better):

            y_true: array-like of shape = [n_samples]
                The target values.
            y_pred: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class)
                The predicted values.
            weight: array-like of shape = [n_samples]
                The weight of samples.
            group: array-like
                Group/query data, used for ranking task.
            eval_name: str
                The name of evaluation.
            eval_result: float
                The eval result.
            is_bigger_better: bool
                Is a bigger eval result better, e.g. AUC is bigger_better.

        For multi-class task, y_pred is grouped by class_id first, then by
        row_id. If you want to get the i-th row of y_pred in the j-th class,
        the access way is y_pred[j * num_data + i].
        """
        super(HyperLGBMClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperLGBMClassifier, self).fit(X=X, y=y,
                        sample_weight=sample_weight,
                        init_score=init_score,
                        eval_set=eval_set,
                        eval_names=eval_names,
                        eval_sample_weight=eval_sample_weight,
                        eval_class_weight=eval_class_weight,
                        eval_init_score=eval_init_score,
                        eval_metric=eval_metric,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=verbose,
                        feature_name=feature_name,
                        categorical_feature=categorical_feature,
                        callbacks=callbacks)
        return self
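    # Example: train on labeled spectra with a validation set. A minimal
    # sketch; X, y, X_val and y_val are hypothetical arrays, each row of
    # X and X_val being one pixel spectrum and y holding integer labels.
    #
    #    model = HyperLGBMClassifier(n_estimators=200, learning_rate=0.05)
    #    model.fit(X, y, eval_set=[(X_val, y_val)], eval_names=['val'],
    #              verbose=False)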
    def partial_fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="logloss", early_stopping_rounds=None,
            verbose=True, feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """ See the fit() method doc. """
        super(HyperLGBMClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperLGBMClassifier, self).partial_fit(X=X, y=y,
                        sample_weight=sample_weight,
                        init_score=init_score,
                        eval_set=eval_set,
                        eval_names=eval_names,
                        eval_sample_weight=eval_sample_weight,
                        eval_init_score=eval_init_score,
                        eval_metric=eval_metric,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=verbose,
                        feature_name=feature_name,
                        categorical_feature=categorical_feature,
                        callbacks=callbacks)
        return self
    def fit_rois(self, M, ROIs):
        """
        Fit the HS cube M with the use of ROIs.

        Parameters
        ----------
        M : numpy array
            A HSI cube (m x n x p).
        ROIs : ROIs class type
            Regions of interest instance.
        """
        X, y = self._fit_rois(M, ROIs)
        super(HyperLGBMClassifier, self).fit(X, y)
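    # Example: fit directly on a hyperspectral cube with regions of
    # interest. A minimal sketch; M is a hypothetical (m x n x p) cube
    # and rois is a pysptools ROIs instance built beforehand.
    #
    #    model = HyperLGBMClassifier()
    #    model.fit_rois(M, rois)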
    # hacks for the save and load functionalities

    def set_n_labels(self, n):
        super(HyperLGBMClassifier, self)._set_n_clusters(n)

    def set_le(self, y):
        super(HyperLGBMClassifier, self).set_le(y)

    def set_n_features_(self, n):
        super(HyperLGBMClassifier, self).set_n_features_(n)
    def classify(self, M, raw_score=False, num_iteration=0):
        """
        Classify a hyperspectral cube.

        Parameters
        ----------
        M : numpy array
            A HSI cube (m x n x p).
        raw_score : bool, optional (default=False)
            Passed through to LGBMClassifier.predict.
        num_iteration : int, optional (default=0)
            Passed through to LGBMClassifier.predict.

        Returns
        -------
        numpy array : a class map (m x n x 1)
        """
        img = self._convert2D(M)
        cls = super(HyperLGBMClassifier, self).predict(img,
                        raw_score=raw_score, num_iteration=num_iteration)
        cmap = self._convert3d(cls, M.shape[0], M.shape[1])
        super(HyperLGBMClassifier, self)._set_cmap(cmap)
        return self.cmap
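    # Example: classify a whole cube after fitting. A minimal sketch
    # continuing the fit_rois example above, with the same cube M.
    #
    #    cmap = model.classify(M)    # class map, one label per pixel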
    def save(self, fname, n_features, n_classes):
        """
        Save the model and its parameters in two files. When the model is
        loaded, it instantiates an object of the HyperLGBMClassifier class.
        See the load_lgbm_model function doc.

        Parameters
        ----------
        fname : path
            The model file name.
        n_features : int
            The model number of features.
        n_classes : int
            The model number of classes, ex. for a binary model
            n_classes = 2 (the background is a class for pysptools).
        """
        meta = {'n_features': n_features, 'n_classes': n_classes}
        param_map = self.get_params()
        params = {'meta': meta, 'param_map': param_map}
        pickle.dump(params, open(fname + '.p', "wb"))
        self.booster_.save_model(fname)
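    # Example: save a trained model, then reload it with load_lgbm_model.
    # A minimal sketch; 'my_model' is a hypothetical base file name for a
    # model trained on 200-band spectra with 3 classes (background included).
    #
    #    model.save('my_model', n_features=200, n_classes=3)
    #    restored = load_lgbm_model('my_model')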
    def plot_feature_importances(self, path, n_labels='all', sort=False, suffix=''):
        """
        Plot the feature importances. The output can be split into n graphs.

        Parameters
        ----------
        path : string
            The path where to save the plot.
        n_labels : string or integer
            The number of labels to output by graph. If the value is 'all',
            only one graph is generated.
        sort : boolean [default False]
            If True, the feature importances are sorted.
        suffix : string [default '']
            Add a suffix to the file name.
        """
        _plot_feature_importances('HyperLGBM', self.feature_importances_, path,
                                  n_labels=n_labels, sort=sort, suffix=suffix)
    def display_feature_importances(self, n_labels='all', sort=False, suffix=''):
        """
        Display the feature importances. The output can be split into n graphs.

        Parameters
        ----------
        n_labels : string or integer
            The number of labels to output by graph. If the value is 'all',
            only one graph is generated.
        sort : boolean [default False]
            If True, the feature importances are sorted.
        suffix : string [default '']
            Add a suffix to the file name.
        """
        _plot_feature_importances('', self.feature_importances_, None,
                                  n_labels=n_labels, sort=sort, suffix=suffix)
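# Example: inspect which bands drive the classification. A minimal sketch;
# 'plots' is a hypothetical output directory.
#
#    model.plot_feature_importances('plots', sort=True, suffix='lgbm')
#    # or, in an interactive session:
#    model.display_feature_importances(sort=True)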