
#
#------------------------------------------------------------------------------
# Copyright (c) 2013-2018, Christian Therien
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#------------------------------------------------------------------------------
#
# hyperxgb.py - This file is part of the PySptools package.
#


# Use with xgboost version 0.6a2

import numpy as np
import pickle

from pysptools.skl.base import HyperBaseClassifier
from pysptools.ml.plotting import _plot_feature_importances
from xgboost import XGBClassifier


# TODO:
# partial_fit
# Notes:
# seed is not used here; it is used by mknfold and cv

# Hacks needed to run load_xgb_model()
# Add to XGBClassifier class (file sklearn.py):
#    # Patch pysptools
#    def set_le(self, y):
#        self._le = XGBLabelEncoder().fit(y)
#    # end patch pysptools
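#
# A runtime alternative to editing xgboost's sklearn.py is to monkey-patch
# the class at import time. A minimal sketch, assuming sklearn's
# LabelEncoder is compatible with the encoder xgboost uses internally:
#
#     from sklearn.preprocessing import LabelEncoder
#
#     def _set_le(self, y):
#         # hypothetical helper, mirrors the patch above
#         self._le = LabelEncoder().fit(y)
#
#     XGBClassifier.set_le = _set_le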

def load_xgb_model(fname):
    """
    Load a XGBoost model that was saved as a file with the
    HyperXGBClassifier.save method. The model spans two files:

    * The first file contains the model saved with the Booster class;
      this file has no extension.
    * The second file contains the parameters used to create the model;
      this file has the extension '.p'.

    Parameters
    ----------
    fname : path
        The file name without extension.

    Returns
    -------
    HyperXGBClassifier class : a model instance
    """
    from xgboost import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperXGBClassifier(**param_map)
    # n_labels excludes the background class
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model._Booster = Booster(model_file=fname)
    return model
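
# A minimal usage sketch for load_xgb_model, assuming a model was saved
# beforehand with HyperXGBClassifier.save; the 'my_model' prefix and the
# HSI cube M are hypothetical:
#
#     model = load_xgb_model('my_model')
#     cmap = model.classify(M)
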
class HyperXGBClassifier(XGBClassifier, HyperBaseClassifier):
    """
    XGBoost classifier for Hyperspectral Imaging.
    The class implements the scikit-learn API and is a pysptools submodule.
    It adds the save and load model functionalities.

    The following is copied from the XGBModel documentation.

    Implementation of the Scikit-Learn API for XGBoost.

    Parameters
    ----------
    max_depth : int
        Maximum tree depth for base learners.
    learning_rate : float
        Boosting learning rate (xgb's "eta").
    n_estimators : int
        Number of boosted trees to fit.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    booster : string
        Specify which booster to use: gbtree, gblinear or dart.
    nthread : int
        Number of parallel threads used to run xgboost.
        (Deprecated, please use n_jobs)
    n_jobs : int
        Number of parallel threads used to run xgboost. (replaces nthread)
    gamma : float
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight (hessian) needed in a child.
    max_delta_step : int
        Maximum delta step we allow each tree's weight estimation to be.
    subsample : float
        Subsample ratio of the training instance.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    colsample_bylevel : float
        Subsample ratio of columns for each split, in each level.
    reg_alpha : float (xgb's alpha)
        L1 regularization term on weights.
    reg_lambda : float (xgb's lambda)
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    base_score :
        The initial prediction score of all instances, global bias.
    seed : int
        Random number seed. (Deprecated, please use random_state)
    random_state : int
        Random number seed. (replaces seed)
    missing : float, optional
        Value in the data which needs to be present as a missing value.
        If None, defaults to np.nan.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true : array_like of shape [n_samples]
        The target values
    y_pred : array_like of shape [n_samples]
        The predicted values
    grad : array_like of shape [n_samples]
        The value of the gradient for each sample point.
    hess : array_like of shape [n_samples]
        The value of the second derivative for each sample point.
    """

    def __init__(self, max_depth=3, learning_rate=0.1,
                 n_estimators=100, silent=True,
                 objective="reg:linear", booster='gbtree',
                 n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, colsample_bytree=1,
                 colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                 scale_pos_weight=1, base_score=0.5, random_state=0,
                 seed=None, missing=None):
        super(HyperXGBClassifier, self).__init__(
            max_depth, learning_rate, n_estimators, silent, objective,
            booster, n_jobs, nthread, gamma, min_child_weight,
            max_delta_step, subsample, colsample_bytree, colsample_bylevel,
            reg_alpha, reg_lambda, scale_pos_weight, base_score,
            random_state, seed, missing)
        HyperBaseClassifier.__init__(self, 'HyperXGBClassifier')
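
    # A construction sketch; the hyperparameter values below are purely
    # illustrative, not recommended settings:
    #
    #     clf = HyperXGBClassifier(max_depth=6, learning_rate=0.1,
    #                              n_estimators=200)
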
    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True, xgb_model=None):
        """
        Fit gradient boosting classifier.

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            Weight for each instance
        eval_set : list, optional
            A list of (X, y) pairs to use as a validation set for
            early-stopping
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The
            call signature is func(y_predicted, y_true) where y_true will be
            a DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int, optional
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue
            training. Requires at least one item in evals. If there's more
            than one, will use the last. Returns the model from the last
            iteration (not the best one). If early stopping occurs, the
            model will have three additional fields: bst.best_score,
            bst.best_iteration and bst.best_ntree_limit. (Use
            bst.best_ntree_limit to get the correct value if
            num_parallel_tree and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the
            evaluation metric measured on the validation set to stderr.
        xgb_model : str
            File name of stored xgb model or 'Booster' instance. Xgb model
            to be loaded before training (allows training continuation).
        """
        super(HyperXGBClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperXGBClassifier, self).fit(X=X, y=y,
                                            sample_weight=sample_weight,
                                            eval_set=eval_set,
                                            eval_metric=eval_metric,
                                            early_stopping_rounds=early_stopping_rounds,
                                            verbose=verbose,
                                            xgb_model=xgb_model)
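
    # A fit sketch with early stopping; X_train, y_train, X_val and y_val
    # are hypothetical arrays of shape (n_samples, n_features) and
    # (n_samples,):
    #
    #     clf.fit(X_train, y_train,
    #             eval_set=[(X_val, y_val)],
    #             eval_metric='mlogloss',
    #             early_stopping_rounds=10)
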
    def partial_fit(self, X, y, sample_weight=None, eval_set=None,
                    eval_metric=None, early_stopping_rounds=None,
                    verbose=True, xgb_model=None):
        """ See the fit() method doc. """
        super(HyperXGBClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperXGBClassifier, self).partial_fit(X=X, y=y,
                                                    sample_weight=sample_weight,
                                                    eval_set=eval_set,
                                                    eval_metric=eval_metric,
                                                    early_stopping_rounds=early_stopping_rounds,
                                                    verbose=verbose,
                                                    xgb_model=xgb_model)
    def fit_rois(self, M, ROIs):
        """
        Fit the HS cube M with the use of ROIs.

        Parameters
        ----------
        M : numpy array
            A HSI cube (m x n x p).

        ROIs : ROIs class type
            Regions of interest instance.
        """
        X, y = self._fit_rois(M, ROIs)
        super(HyperXGBClassifier, self).fit(X, y)
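
    # A fit_rois sketch; M is a hypothetical (m x n x p) HSI cube and rois
    # a pysptools ROIs instance built beforehand. The 'rec' region below is
    # an assumed illustration of the ROIs API, see the ROIs class doc:
    #
    #     rois.add('Class 1', {'rec': (30, 30, 45, 50)})
    #     clf.fit_rois(M, rois)
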
    def set_n_labels(self, n):
        # hack for the save and load functionalities
        super(HyperXGBClassifier, self)._set_n_clusters(n)

    def set_le(self, y):
        # hack for the save and load functionalities
        super(HyperXGBClassifier, self).set_le(y)
    def classify(self, M, output_margin=False, ntree_limit=0):
        """
        Classify a hyperspectral cube.

        Parameters
        ----------
        M : numpy array
            A HSI cube (m x n x p).

        output_margin : boolean [default False]
            Whether to output the raw untransformed margin value.

        ntree_limit : int [default 0]
            Limit the number of trees used in the prediction; zero means
            use all trees.

        Returns
        -------
        numpy array : a class map (m x n x 1)
        """
        img = self._convert2D(M)
        cls = super(HyperXGBClassifier, self).predict(img,
                                                      output_margin=output_margin,
                                                      ntree_limit=ntree_limit)
        cmap = self._convert3d(cls, M.shape[0], M.shape[1])
        super(HyperXGBClassifier, self)._set_cmap(cmap)
        return self.cmap
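
    # A classify sketch; M is a hypothetical (m x n x p) cube with the
    # same number of bands as the training data:
    #
    #     cmap = clf.classify(M)
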
    def save(self, fname, n_features, n_classes):
        """
        Save the model and its parameters in two files. When the model is
        loaded, it instantiates an object of class HyperXGBClassifier.
        See the load_xgb_model function doc.

        Parameters
        ----------
        fname : path
            The model file name.

        n_features : int
            The model number of features.

        n_classes : int
            The model number of classes, ex. for a binary model
            n_classes = 2 (the background is a class for pysptools).
        """
        meta = {'n_features': n_features, 'n_classes': n_classes}
        param_map = self.get_xgb_params()
        params = {'meta': meta, 'param_map': param_map}
        pickle.dump(params, open(fname + '.p', "wb"))
        self.get_booster().save_model(fname)
        #self.booster().save_model(fname)
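
    # A save/load round-trip sketch; the 'my_model' prefix and the
    # feature/class counts are hypothetical:
    #
    #     clf.save('my_model', n_features=200, n_classes=3)
    #     clf2 = load_xgb_model('my_model')
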
    def plot_feature_importances(self, path, n_labels='all', sort=False,
                                 suffix=''):
        """
        Plot the feature importances. The output can be split into n graphs.

        Parameters
        ----------
        path : string
            The path where to save the plot.

        n_labels : string or integer
            The number of labels to output by graph. If the value is 'all',
            only one graph is generated.

        sort : boolean [default False]
            If True, the feature importances are sorted.

        suffix : string [default '']
            Add a suffix to the file name.
        """
        _plot_feature_importances('HyperXGBC', self.feature_importances_,
                                  path, n_labels=n_labels, sort=sort,
                                  suffix=suffix)
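
    # A plotting sketch; the output directory is hypothetical:
    #
    #     clf.plot_feature_importances('./results', n_labels=10, sort=True)
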
    def display_feature_importances(self, n_labels='all', sort=False,
                                    suffix=''):
        """
        Display the feature importances. The output can be split into n
        graphs.

        Parameters
        ----------
        n_labels : string or integer
            The number of labels to output by graph. If the value is 'all',
            only one graph is generated.

        sort : boolean [default False]
            If True, the feature importances are sorted.

        suffix : string [default '']
            Add a suffix to the file name.
        """
        _plot_feature_importances('', self.feature_importances_, None,
                                  n_labels=n_labels, sort=sort,
                                  suffix=suffix)