#
#------------------------------------------------------------------------------
# Copyright (c) 2013-2018, Christian Therien
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#------------------------------------------------------------------------------
#
# hyperxgb.py - This file is part of the PySptools package.
#
# Use with xgboost version 0.6a2
import numpy as np
import pickle
from pysptools.skl.base import HyperBaseClassifier
from pysptools.ml.plotting import _plot_feature_importances
from xgboost import XGBClassifier
# TODO:
# partial_fit
# Notes:
# seed is not used here; it is used by mknfold and cv
# Hacks needed to run load_xgb_model()
# Add to the XGBClassifier class (file sklearn.py):
#     # Patch pysptools
#     def set_le(self, y):
#         self._le = XGBLabelEncoder().fit(y)
#     # end patch pysptools
def load_xgb_model(fname):
""" Load a XGBoost model that was saved as a file with
the HyperXGBClassifier.save method.
The model is span on two files:
* The first file contains the model saved with the Booster class,
this file have no extension.
* The second file contains the parameters used to create the model,
this file have the extension '.p'.
Parameters
----------
fname : path
The file name without extension.
Returns
-------
HyperXGBClassifier class : a model instance
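
    Example
    -------
    A minimal usage sketch; the file stem 'xgb_model' is hypothetical and
    must match the fname passed to HyperXGBClassifier.save:

    >>> model = load_xgb_model('xgb_model')  # reads 'xgb_model' and 'xgb_model.p'
    >>> cmap = model.classify(M)             # M is an m x n x p HSI cube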
"""
    from xgboost import Booster
    # Restore the constructor parameters saved by HyperXGBClassifier.save
    with open(fname + '.p', "rb") as f:
        params = pickle.load(f)
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperXGBClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    # Rebuild the label encoder for the class labels 0..n_classes-1
    y = [i for i in range(n_classes)]
    model.set_le(y)
    # Attach the booster loaded from the extensionless model file
    model._Booster = Booster(model_file=fname)
    return model
class HyperXGBClassifier(XGBClassifier, HyperBaseClassifier):
"""
XGBoost classifier for Hyperspectral Imaging.
The class implement the scikit-learn API and is a pysptools submodule.
This class add the save and load model functionalities.
Following is a copy and paste form XGBModel documentation.
Implementation of the Scikit-Learn API for XGBoost.
Parameters
----------
max_depth : int
Maximum tree depth for base learners.
learning_rate : float
Boosting learning rate (xgb's "eta")
n_estimators : int
Number of boosted trees to fit.
silent : boolean
Whether to print messages while running boosting.
objective : string or callable
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
booster : string
Specify which booster to use: gbtree, gblinear or dart.
nthread : int
Number of parallel threads used to run xgboost. (Deprecated, please use n_jobs)
n_jobs : int
Number of parallel threads used to run xgboost. (replaces nthread)
gamma : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
Minimum sum of instance weight(hessian) needed in a child.
max_delta_step : int
Maximum delta step we allow each tree's weight estimation to be.
subsample : float
Subsample ratio of the training instance.
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
colsample_bylevel : float
Subsample ratio of columns for each split, in each level.
reg_alpha : float (xgb's alpha)
L1 regularization term on weights
reg_lambda : float (xgb's lambda)
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
base_score:
The initial prediction score of all instances, global bias.
seed : int
Random number seed. (Deprecated, please use random_state)
random_state : int
Random number seed. (replaces seed)
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
Note
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples]
The predicted values
grad: array_like of shape [n_samples]
The value of the gradient for each sample point.
hess: array_like of shape [n_samples]
The value of the second derivative for each sample point.
"""
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
silent=True, objective="reg:linear", booster='gbtree',
n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
subsample=1, colsample_bytree=1, colsample_bylevel=1,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
base_score=0.5, random_state=0, seed=None, missing=None):
super(HyperXGBClassifier, self).__init__(max_depth, learning_rate,
n_estimators, silent, objective, booster,
n_jobs, nthread, gamma, min_child_weight,
max_delta_step, subsample,
colsample_bytree, colsample_bylevel,
reg_alpha, reg_lambda,
scale_pos_weight, base_score,
random_state, seed, missing)
HyperBaseClassifier.__init__(self, 'HyperXGBClassifier')
    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, xgb_model=None):
"""
Fit gradient boosting classifier
Parameters
----------
X : array_like
Feature matrix
y : array_like
Labels
sample_weight : array_like
Weight for each instance
eval_set : list, optional
A list of (X, y) pairs to use as a validation set for
early-stopping
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
for the evaluation and value is the value of the evaluation
function. This objective is always minimized.
early_stopping_rounds : int, optional
Activates early stopping. Validation error needs to decrease at
least every <early_stopping_rounds> round(s) to continue training.
Requires at least one item in evals. If there's more than one,
will use the last. Returns the model from the last iteration
(not the best one). If early stopping occurs, the model will
have three additional fields: bst.best_score, bst.best_iteration
and bst.best_ntree_limit.
(Use bst.best_ntree_limit to get the correct value if num_parallel_tree
and/or num_class appears in the parameters)
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
metric measured on the validation set to stderr.
xgb_model : str
file name of stored xgb model or 'Booster' instance Xgb model to be
loaded before training (allows training continuation).
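
        Example
        -------
        A minimal sketch, assuming a labeled HSI cube M (m x n x p) and a
        matching label map gt (m x n) where 0 is the background class:

        >>> X = M.reshape(-1, M.shape[2])  # one spectrum per row
        >>> y = gt.ravel()
        >>> clf = HyperXGBClassifier(n_estimators=50, max_depth=4)
        >>> clf.fit(X, y)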
"""
        # Record the number of classes; labels run 1..max, 0 is the background
        super(HyperXGBClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperXGBClassifier, self).fit(X=X, y=y, sample_weight=sample_weight,
                                            eval_set=eval_set, eval_metric=eval_metric,
                                            early_stopping_rounds=early_stopping_rounds,
                                            verbose=verbose, xgb_model=xgb_model)
    def partial_fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, xgb_model=None):
""" See fit() method doc """
super(HyperXGBClassifier, self)._set_n_clusters(int(np.max(y)))
super(HyperXGBClassifier, self).partial_fit(X=X, y=y, sample_weight=sample_weight,
eval_set=eval_set, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, verbose=verbose,
xgb_model=xgb_model)
    def fit_rois(self, M, ROIs):
"""
Fit the HS cube M with the use of ROIs.
Parameters
----------
M : numpy array
A HSI cube (m x n x p).
ROIs : ROIs class type
Regions of interest instance.
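
        Example
        -------
        A minimal sketch; the import path, ROI name, and rectangle are
        assumptions for illustration:

        >>> from pysptools.util import ROIs
        >>> rois = ROIs(M.shape[0], M.shape[1])
        >>> rois.add('Class 1', {'rec': (10, 10, 30, 30)})
        >>> clf = HyperXGBClassifier()
        >>> clf.fit_rois(M, rois)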
"""
X, y = self._fit_rois(M, ROIs)
super(HyperXGBClassifier, self).fit(X, y)
    def set_n_labels(self, n):
        # hack needed by the save and load functionality
        super(HyperXGBClassifier, self)._set_n_clusters(n)

    def set_le(self, y):
        # hack needed by the save and load functionality; requires the
        # set_le patch on XGBClassifier (see the note at the top of this file)
        super(HyperXGBClassifier, self).set_le(y)
    def classify(self, M, output_margin=False, ntree_limit=0):
"""
Classify a hyperspectral cube.
Parameters
----------
M : numpy array
A HSI cube (m x n x p).
Returns
-------
numpy array : a class map (m x n x 1)
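
        Example
        -------
        A minimal sketch, assuming a fitted classifier clf and an HSI
        cube M (m x n x p):

        >>> cmap = clf.classify(M)  # per-pixel class labels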
"""
        # Flatten the cube to a 2D matrix, one spectrum per row
        img = self._convert2D(M)
        cls = super(HyperXGBClassifier, self).predict(img,
                    output_margin=output_margin, ntree_limit=ntree_limit)
        # Fold the predicted labels back to the cube's spatial dimensions
        cmap = self._convert3d(cls, M.shape[0], M.shape[1])
        super(HyperXGBClassifier, self)._set_cmap(cmap)
        return self.cmap
    def save(self, fname, n_features, n_classes):
"""
Save the model and is parameters in two files.
When the model is loaded, it instantiate an object of class
HyperXGBClassifier. See load_xgb_model function doc.
Parameters
----------
fname : path
The model file name.
n_features : int
The model number of features.
n_classes : int
The model number of classes, ex. for a binary model
n_classes = 2 (the background is a class for pysptools).
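
        Example
        -------
        A minimal sketch; the file stem 'xgb_model' is hypothetical. The
        call writes 'xgb_model' (the booster) and 'xgb_model.p' (the
        parameters):

        >>> clf.save('xgb_model', n_features=X.shape[1], n_classes=3)
        >>> clf2 = load_xgb_model('xgb_model')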
"""
        meta = {'n_features': n_features, 'n_classes': n_classes}
        param_map = self.get_xgb_params()
        params = {'meta': meta, 'param_map': param_map}
        # The parameters go to '<fname>.p', the booster to '<fname>'
        with open(fname + '.p', "wb") as f:
            pickle.dump(params, f)
        self.get_booster().save_model(fname)
        # self.booster().save_model(fname)  # older xgboost API
    def plot_feature_importances(self, path, n_labels='all', sort=False, suffix=''):
"""
Plot the feature importances.
The output can be split in n graphs.
Parameters
----------
path : string
The path where to save the plot.
n_labels : string or integer
The number of labels to output by graph. If the value is 'all',
only one graph is generated.
sort : boolean [default False]
If true the feature importances are sorted.
suffix : string [default None]
Add a suffix to the file name.
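
        Example
        -------
        A minimal sketch; the output directory is hypothetical:

        >>> clf.plot_feature_importances('./results', n_labels=10, sort=True)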
"""
_plot_feature_importances('HyperXGBC', self.feature_importances_, path,
n_labels=n_labels, sort=sort, suffix=suffix)
    def display_feature_importances(self, n_labels='all', sort=False, suffix=''):
"""
Display the feature importances.
The output can be split in n graphs.
Parameters
----------
n_labels : string or integer
The number of labels to output by graph. If the value is 'all',
only one graph is generated.
sort : boolean [default False]
If true the feature importances are sorted.
suffix : string [default None]
Add a suffix to the file name.
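
        Example
        -------
        A minimal sketch, run from an interactive (e.g. IPython) session:

        >>> clf.display_feature_importances(sort=True)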
"""
_plot_feature_importances('', self.feature_importances_, None,
n_labels=n_labels, sort=sort, suffix=suffix)