#
#------------------------------------------------------------------------------
# Copyright (c) 2013-2018, Christian Therien
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#------------------------------------------------------------------------------
#
# hyperlgbm.py - This file is part of the PySptools package.
#
import numpy as np
import pickle
from pysptools.skl.base import HyperBaseClassifier
from pysptools.ml.plotting import _plot_feature_importances
from lightgbm import LGBMClassifier
# Hacks needed to run load_lgbm_model()
# Add to LGBMModel class (file sklearn.py):
# # Patch pysptools
# def set_n_features_(self, n):
# self._n_features = n
# # end patch pysptools
#
# Add to LGBMClassifier class:
# # Patch pysptools
# def set_le(self, y):
# self._le = _LGBMLabelEncoder().fit(y)
# self._classes = self._le.classes_
# self._n_classes = len(self._classes)
#
# def set_n_features_(self, n):
# super(LGBMClassifier, self).set_n_features_(n)
# # end patch pysptools
def load_lgbm_model(fname):
    """ Load a LightGBM model that was saved as a file with
    the HyperLGBMClassifier.save method.

    The model spans two files:

    * The first file contains the model saved with the Booster class;
      this file has no extension.
    * The second file contains the parameters used to create the model;
      this file has the extension '.p'.
Parameters
----------
fname : path
The file name without extension.
Returns
-------
HyperLGBMClassifier class : a model instance
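
    Example (a minimal sketch; the file name 'my_model' and the HSI cube
    M are illustrative, not part of the package)::

        # Assumes HyperLGBMClassifier.save('my_model', n_features, n_classes)
        # produced the files 'my_model' and 'my_model.p'
        model = load_lgbm_model('my_model')
        cmap = model.classify(M)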
"""
from lightgbm import Booster
    with open(fname + '.p', "rb") as f:
        params = pickle.load(f)
n_features = params['meta']['n_features']
n_classes = params['meta']['n_classes']
param_map = params['param_map']
model = HyperLGBMClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = list(range(n_classes))
model.set_le(y)
model.set_n_features_(n_features)
model._Booster = Booster(model_file=fname)
return model
class HyperLGBMClassifier(LGBMClassifier, HyperBaseClassifier):
    """
    LightGBM classifier for Hyperspectral Imaging.

    The class implements the scikit-learn API and is a pysptools submodule.
    It adds save and load functionality to the model.

    The parameter documentation below is copied from the LGBMModel
    documentation.

    Construct a gradient boosting model.

    Parameters
    ----------
boosting_type : string, optional (default="gbdt")
'gbdt', traditional Gradient Boosting Decision Tree.
'dart', Dropouts meet Multiple Additive Regression Trees.
'goss', Gradient-based One-Side Sampling.
'rf', Random Forest.
num_leaves : int, optional (default=31)
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
Maximum tree depth for base learners, -1 means no limit.
learning_rate : float, optional (default=0.1)
Boosting learning rate.
n_estimators : int, optional (default=100)
Number of boosted trees to fit.
    subsample_for_bin : int, optional (default=200000)
Number of samples for constructing bins.
objective : string, callable or None, optional (default=None)
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
class_weight : dict, 'balanced' or None, optional (default=None)
Weights associated with classes in the form ``{class_label: weight}``.
Use this parameter only for multi-class classification task;
for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
The 'balanced' mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
If None, all classes are supposed to have weight one.
Note that these weights will be multiplied with ``sample_weight`` (passed through the fit method)
if ``sample_weight`` is specified.
min_split_gain : float, optional (default=0.)
Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : float, optional (default=1e-3)
        Minimum sum of instance weight (hessian) needed in a child (leaf).
    min_child_samples : int, optional (default=20)
        Minimum number of data needed in a child (leaf).
subsample : float, optional (default=1.)
Subsample ratio of the training instance.
    subsample_freq : int, optional (default=1)
        Frequency of subsample; <=0 means disabled.
colsample_bytree : float, optional (default=1.)
Subsample ratio of columns when constructing each tree.
reg_alpha : float, optional (default=0.)
L1 regularization term on weights.
reg_lambda : float, optional (default=0.)
L2 regularization term on weights.
random_state : int or None, optional (default=None)
Random number seed.
        Will use default seeds in C++ code if set to None.
n_jobs : int, optional (default=-1)
Number of parallel threads.
silent : bool, optional (default=True)
Whether to print messages while running boosting.
Attributes
----------
n_features_ : int
The number of features of fitted model.
classes_ : array of shape = [n_classes]
The class label array (only for classification problem).
n_classes_ : int
The number of classes (only for classification problem).
best_score_ : dict or None
The best score of fitted model.
best_iteration_ : int or None
The best iteration of fitted model if ``early_stopping_rounds`` has been specified.
objective_ : string or callable
The concrete objective used while fitting this model.
booster_ : Booster
The underlying Booster of this model.
evals_result_ : dict or None
The evaluation results if ``early_stopping_rounds`` has been specified.
feature_importances_ : array of shape = [n_features]
The feature importances (the higher, the more important the feature).
Note
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess`` or
``objective(y_true, y_pred, group) -> grad, hess``:
y_true: array-like of shape = [n_samples]
The target values.
y_pred: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The predicted values.
group: array-like
Group/query data, used for ranking task.
grad: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the gradient for each sample point.
hess: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
The value of the second derivative for each sample point.
    For the multi-class task, y_pred is grouped by class_id first, then by row_id.
    If you want to get the i-th row of y_pred in the j-th class, access it as
    y_pred[j * num_data + i], and you should group grad and hess in this way as well.
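
    Example of a custom objective (a minimal sketch for the binary task,
    added for illustration; the function name is hypothetical)::

        def logistic_obj(y_true, y_pred):
            # Gradient and hessian of the binary log loss w.r.t. the raw score
            p = 1.0 / (1.0 + np.exp(-y_pred))
            grad = p - y_true
            hess = p * (1.0 - p)
            return grad, hess

        model = HyperLGBMClassifier(objective=logistic_obj)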
"""
    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=100,
                 subsample_for_bin=200000, objective=None, class_weight=None,
                 min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
                 subsample=1., subsample_freq=1, colsample_bytree=1.,
                 reg_alpha=0., reg_lambda=0., random_state=None,
                 n_jobs=-1, silent=True):
        super(HyperLGBMClassifier, self).__init__(
            boosting_type=boosting_type, num_leaves=num_leaves,
            max_depth=max_depth, learning_rate=learning_rate,
            n_estimators=n_estimators, subsample_for_bin=subsample_for_bin,
            objective=objective, class_weight=class_weight,
            min_split_gain=min_split_gain, min_child_weight=min_child_weight,
            min_child_samples=min_child_samples, subsample=subsample,
            subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha, reg_lambda=reg_lambda,
            random_state=random_state, n_jobs=n_jobs, silent=silent)
        HyperBaseClassifier.__init__(self, 'HyperLGBMClassifier')
    def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_names=None, eval_sample_weight=None,
eval_class_weight=None, eval_init_score=None, eval_metric="logloss",
early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', callbacks=None):
"""
Build a gradient boosting model from the training set (X, y).
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
Input feature matrix.
y : array-like of shape = [n_samples]
The target values (class labels in classification, real numbers in regression).
sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
Weights of training data.
        init_score : array-like of shape = [n_samples] or None, optional (default=None)
            Init score of training data.
eval_set : list or None, optional (default=None)
A list of (X, y) tuple pairs to use as a validation sets for early-stopping.
eval_names : list of strings or None, optional (default=None)
Names of eval_set.
eval_sample_weight : list of arrays or None, optional (default=None)
Weights of eval data.
eval_class_weight : list or None, optional (default=None)
Class weights of eval data.
        eval_init_score : list of arrays or None, optional (default=None)
            Init score of eval data.
        eval_metric : string, list of strings, callable or None, optional (default="logloss")
            If string, it should be a built-in evaluation metric to use.
            If callable, it should be a custom evaluation metric, see note for more details.
early_stopping_rounds : int or None, optional (default=None)
Activates early stopping. The model will train until the validation score stops improving.
Validation error needs to decrease at least every ``early_stopping_rounds`` round(s)
to continue training.
verbose : bool, optional (default=True)
If True and an evaluation set is used, writes the evaluation progress.
        feature_name : list of strings or 'auto', optional (default="auto")
            Feature names.
            If 'auto' and data is pandas DataFrame, data column names are used.
categorical_feature : list of strings or int, or 'auto', optional (default="auto")
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
callbacks : list of callback functions or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python API for more information.
Returns
-------
self : object
Returns self.
Note
----
Custom eval function expects a callable with following functions:
``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
``func(y_true, y_pred, weight, group)``.
Returns (eval_name, eval_result, is_bigger_better) or
list of (eval_name, eval_result, is_bigger_better)
y_true: array-like of shape = [n_samples]
The target values.
y_pred: array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class)
The predicted values.
weight: array-like of shape = [n_samples]
The weight of samples.
group: array-like
Group/query data, used for ranking task.
eval_name: str
The name of evaluation.
eval_result: float
The eval result.
is_bigger_better: bool
Is eval result bigger better, e.g. AUC is bigger_better.
        For the multi-class task, y_pred is grouped by class_id first, then by row_id.
        If you want to get the i-th row of y_pred in the j-th class, access it as y_pred[j * num_data + i].
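
        Example of a custom eval function for the multi-class task (a sketch;
        the name error_rate is illustrative, and the reshape follows the
        class_id-major layout described above)::

            def error_rate(y_true, y_pred):
                n = len(y_true)
                # y_pred is flat: grouped by class_id first, then by row_id
                pred_labels = y_pred.reshape(-1, n).argmax(axis=0)
                return 'error_rate', float(np.mean(pred_labels != y_true)), False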
"""
        super(HyperLGBMClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperLGBMClassifier, self).fit(X=X, y=y,
                  sample_weight=sample_weight, init_score=init_score,
                  eval_set=eval_set, eval_names=eval_names,
                  eval_sample_weight=eval_sample_weight,
                  eval_class_weight=eval_class_weight,
                  eval_init_score=eval_init_score, eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds, verbose=verbose,
                  feature_name=feature_name,
                  categorical_feature=categorical_feature,
                  callbacks=callbacks)
        return self
    def partial_fit(self, X, y,
                    sample_weight=None, init_score=None,
                    eval_set=None, eval_names=None, eval_sample_weight=None,
                    eval_init_score=None, eval_metric="logloss",
                    early_stopping_rounds=None, verbose=True,
                    feature_name='auto', categorical_feature='auto', callbacks=None):
        """ See the fit() method doc. """
        super(HyperLGBMClassifier, self)._set_n_clusters(int(np.max(y)))
        super(HyperLGBMClassifier, self).partial_fit(X=X, y=y,
                  sample_weight=sample_weight, init_score=init_score,
                  eval_set=eval_set, eval_names=eval_names,
                  eval_sample_weight=eval_sample_weight,
                  eval_init_score=eval_init_score, eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds, verbose=verbose,
                  feature_name=feature_name,
                  categorical_feature=categorical_feature,
                  callbacks=callbacks)
    def fit_rois(self, M, ROIs):
        """
        Fit the model to the HS cube M using regions of interest (ROIs).

Parameters
----------
M : numpy array
A HSI cube (m x n x p).
ROIs : ROIs class type
Regions of interest instance.
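
        Example (illustrative; rois is a pysptools ROIs instance with
        labeled regions already defined on the cube M)::

            model = HyperLGBMClassifier(n_estimators=200)
            model.fit_rois(M, rois)
            cmap = model.classify(M)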
"""
X, y = self._fit_rois(M, ROIs)
super(HyperLGBMClassifier, self).fit(X, y)
    def set_n_labels(self, n):
        # Hack needed by the save and load functionality
        super(HyperLGBMClassifier, self)._set_n_clusters(n)

    def set_le(self, y):
        # Hack needed by the save and load functionality
        super(HyperLGBMClassifier, self).set_le(y)

    def set_n_features_(self, n):
        # Hack needed by the save and load functionality
        super(HyperLGBMClassifier, self).set_n_features_(n)
    def classify(self, M, raw_score=False, num_iteration=0):
        """
        Classify a hyperspectral cube.

        Parameters
        ----------
        M : numpy array
            A HSI cube (m x n x p).
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        num_iteration : int, optional (default=0)
            Limit the number of iterations in the prediction;
            0 means use all iterations.

        Returns
        -------
        numpy array : a class map (m x n x 1)
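
        Example (illustrative; M is an (m x n x p) numpy cube and the
        model has been fitted)::

            cmap = model.classify(M)
            # cmap[i, j] holds the predicted class of pixel (i, j)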
"""
img = self._convert2D(M)
cls = super(HyperLGBMClassifier, self).predict(img,
raw_score=raw_score, num_iteration=num_iteration)
cmap = self._convert3d(cls, M.shape[0], M.shape[1])
super(HyperLGBMClassifier, self)._set_cmap(cmap)
return self.cmap
    def save(self, fname, n_features, n_classes):
        """
        Save the model and its parameters in two files.

        When the model is loaded, it instantiates an object of class
        HyperLGBMClassifier. See the load_lgbm_model function doc.

Parameters
----------
fname : path
The model file name.
n_features : int
The model number of features.
        n_classes : int
            The model number of classes, e.g. for a binary model
            n_classes = 2 (the background is a class for pysptools).
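
        Example (a sketch; the file name and the class count are
        illustrative)::

            # X has one column per feature; two classes plus the background
            model.save('my_model', n_features=X.shape[1], n_classes=3)
            # Later, rebuild the classifier from the two files
            model2 = load_lgbm_model('my_model')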
"""
        meta = {'n_features': n_features, 'n_classes': n_classes}
        param_map = self.get_params()
        params = {'meta': meta, 'param_map': param_map}
        with open(fname + '.p', "wb") as f:
            pickle.dump(params, f)
        self.booster_.save_model(fname)
    def plot_feature_importances(self, path, n_labels='all', sort=False, suffix=''):
        """
        Plot the feature importances.

        The output can be split into n graphs.

Parameters
----------
path : string
The path where to save the plot.
n_labels : string or integer
The number of labels to output by graph. If the value is 'all',
only one graph is generated.
        sort : boolean [default False]
            If True, the feature importances are sorted.
        suffix : string [default '']
            Add a suffix to the file name.
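
        Example (illustrative; the output path is hypothetical)::

            model.plot_feature_importances('./plots', n_labels=10, sort=True)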
"""
_plot_feature_importances('HyperLGBM', self.feature_importances_, path,
n_labels=n_labels, sort=sort, suffix=suffix)
    def display_feature_importances(self, n_labels='all', sort=False, suffix=''):
        """
        Display the feature importances.

        The output can be split into n graphs.

Parameters
----------
n_labels : string or integer
The number of labels to output by graph. If the value is 'all',
only one graph is generated.
        sort : boolean [default False]
            If True, the feature importances are sorted.
        suffix : string [default '']
            Add a suffix to the file name.
"""
_plot_feature_importances('', self.feature_importances_, None,
n_labels=n_labels, sort=sort, suffix=suffix)