sklearn Boosting:通过交叉验证找到最佳估计器数量,而无需每次都从头重新训练

2024-10-03 02:42:21 发布

 # Builds a separate AdaBoost model (depth-1 decision tree as base estimator)
 # for each candidate n_estimators value in the list.
 for n in [50,100,150,200,250,300]:
     model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=n) 


from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # We will use simple stumps for individual estimators in AdaBoost.
from sklearn.metrics import accuracy_score

import numpy as np
import matplotlib.pyplot as plt


# Demo: fit AdaBoost once with the maximum number of stumps, then score every
# intermediate ensemble size via staged_predict() instead of refitting.

# Sizes of the training and test partitions.
nSamples = {'train': 2000, 'test': 1000}

# Two features per sample, drawn uniformly from [0, 1).
X = np.random.uniform(size=(nSamples['train'] + nSamples['test'], 2))

# Decision boundary is the unit circle: class 1 is everything outside it.
in_class = X[:, 0]**2 + X[:, 1]**2 > 1
y = np.zeros(len(X), dtype=int)
y[in_class] = 1

# Add some random error by flipping the label of a fraction of the samples.
error_rate = 0.01
to_flip = np.random.choice(np.arange(len(y)),
                           size=int(error_rate * len(y)),
                           replace=False)
y[to_flip] = 1 - y[to_flip]

# Split training and test (first nSamples['train'] rows are the training set).
X = {'train': X[:nSamples['train']],
     'test': X[nSamples['train']:]}
y = {'train': y[:nSamples['train']],
     'test': y[nSamples['train']:]}

# Make AdaBoost Classifier.
max_estimators = 50
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,  # Just a stump.
                                                      random_state=np.random.RandomState(0)),
                               n_estimators=max_estimators,
                               random_state=np.random.RandomState(0))

# Fit all estimators.
ada_boost.fit(X['train'], y['train'])

# Get the train and test accuracy for each stage of prediction.
# The two staged_predict() generators are advanced in lockstep, so entry i of
# each score list belongs to the ensemble made of the first i + 1 estimators.
scores = {'train': [], 'test': []}

for y_predict_train, y_predict_test in zip(ada_boost.staged_predict(X['train']),
                                           ada_boost.staged_predict(X['test'])):
    scores['train'].append(accuracy_score(y['train'], y_predict_train))
    scores['test'].append(accuracy_score(y['test'], y_predict_test))

# Plot the results.
n_estimators = range(1, len(scores['train']) + 1)
for key, values in scores.items():
    # Label each curve so train and test can be told apart in the legend.
    plt.plot(n_estimators, values, label=key)
plt.title('Staged Scores')
plt.xlabel('N Estimators')
plt.ylabel('Accuracy')
plt.legend()
plt.show()





Make a "hack" of AdaBoostClassifier in sklearn.ensemble
that doesn't need to retrain estimators and is compatible with many sklearn
cross validation functions.

import copy 
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.base import clone

# Used to hold important variables between runs of cross validation.
# Note that sklearn cross validation functions use sklearn.base.clone()
# to make copies of the estimator passed to them. sklearn.base.clone()
# makes deep copies of the parameters of an estimator, so the only way to
# remember previous estimators between cross validation runs is to use a
# global variable.
# We will use hash values of the split of X[:, 0] as keys for remembering
# the previous estimators of a cv fold. Note that you can NOT use cross
# validators that randomly shuffle the data before splitting, because that
# produces different hashes on every run.

# Global cache shared by every instance/clone of the classifier below. Keys
# are hashes of a fold's first feature column; values are dicts holding the
# state saved at the end of the last fit() on that fold.
kfold_hash = {}


class WarmRestartAdaBoostClassifier(AdaBoostClassifier):
    """
    Keep track of old estimators, estimator weights, the estimator errors, and
    the next to last sample weight seen.

    Note that AdaBoostClassifier._boost() does NOT boost the last seen sample
    weight. Simple fix to this is to drop the last estimator and retrain it.

    NOTE(review): the constructor mirrors the older AdaBoostClassifier
    signature (base_estimator, n_estimators, learning_rate, algorithm,
    random_state) -- confirm against the installed sklearn version.

    NOTE(review): after fit() shrinks the ensemble, the truncated weights and
    errors are what get stored in kfold_hash while old_estimators_ keeps its
    full length, so growing again afterwards would misalign them. Visit
    n_estimators values in increasing order.
    """

    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=None,
                 next_to_last_sample_weight=None,
                 old_estimators_=None,
                 use_kfold_hash=False):
        """
        Parameters
        ----------
        base_estimator, n_estimators, learning_rate, algorithm, random_state :
            Forwarded unchanged to AdaBoostClassifier.__init__().
        next_to_last_sample_weight : array-like or None
            Sample weights that were fed to the last estimator of a previous
            fit; used by fit_more() to retrain that estimator.
        old_estimators_ : list or None
            Estimators remembered from a previous fit. None (or an empty
            list) means nothing has been fit yet.
        use_kfold_hash : bool
            If True, fit() loads/saves its state from/to the global
            kfold_hash.
        """
        # Everything after the first argument is passed by keyword so the call
        # does not depend on the parent's positional parameter order.
        AdaBoostClassifier.__init__(self, base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)

        self.next_to_last_sample_weight = next_to_last_sample_weight
        self._last_sample_weight = None
        self.old_estimators_ = old_estimators_
        self.use_kfold_hash = use_kfold_hash

    def _boost(self, iboost, X, y, sample_weight, random_state):
        """
        Wrap AdaBoostClassifier._boost() with behavior to record the next to
        last sample weight.

        Parameters and return behavior same as that of
        AdaBoostClassifier._boost().

        Parameters
        ----------
        iboost : int
            The index of the current boost iteration.
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR.
        y : array-like of shape = [n_samples]
            The target values (class labels).
        sample_weight : array-like of shape = [n_samples]
            The current sample weights.
        random_state : RandomState
            The current random number generator

        Returns
        -------
        sample_weight : array-like of shape = [n_samples] or None
            The reweighted sample weights.
            If None then boosting has terminated early.
        estimator_weight : float
            The weight for the current boost.
            If None then boosting has terminated early.
        error : float
            The classification error for the current boost.
            If None then boosting has terminated early.
        """
        fit_info = AdaBoostClassifier._boost(self, iboost, X, y, sample_weight, random_state)
        sample_weight, _, _ = fit_info
        # Shift the window: what was "last" before this boost is now
        # "next to last".
        self.next_to_last_sample_weight = self._last_sample_weight
        self._last_sample_weight = sample_weight
        return fit_info

    def fit(self, X, y):
        """
        Wrap AdaBoostClassifier.fit() to decide whether to throw away
        estimators or add estimators depending on the current number of
        estimators vs the number of old estimators. Also look at the
        possibility of using the global kfold_hash to get old values if
        use_kfold_hash == True.

        Parameters
        ----------
        X : array supporting X[:, 0] and bytes() when use_kfold_hash is True
            The training input samples.
        y : array-like
            The target values (class labels).

        Returns
        -------
        self

        Notes
        -----
        When n_estimators equals the number of remembered estimators, nothing
        is refit or sliced.
        """
        hash_X = None
        if self.use_kfold_hash:
            # Use a hash of X features in this kfold to access the global
            # information for this kfold.
            hash_X = hash(bytes(X[:, 0]))
            if hash_X in kfold_hash:
                cached = kfold_hash[hash_X]
                self.old_estimators_ = cached['old_estimators_']
                self.next_to_last_sample_weight = cached['next_to_last_sample_weight']
                self.estimator_weights_ = cached['estimator_weights_']
                self.estimator_errors_ = cached['estimator_errors_']

        # We haven't done any fits yet.
        if not self.old_estimators_:
            AdaBoostClassifier.fit(self, X, y)
            self.old_estimators_ = self.estimators_

        # The case that we throw away estimators.
        elif self.n_estimators < len(self.old_estimators_):
            self.estimators_ = self.old_estimators_[:self.n_estimators]
            self.estimator_weights_ = self.estimator_weights_[:self.n_estimators]
            self.estimator_errors_ = self.estimator_errors_[:self.n_estimators]

        # The case that we add new estimators.
        elif self.n_estimators > len(self.old_estimators_):
            n_more = self.n_estimators - len(self.old_estimators_)
            self.fit_more(X, y, n_more)

        # Record information in the global hash if necessary.
        if self.use_kfold_hash:
            kfold_hash[hash_X] = {'old_estimators_': self.old_estimators_,
                                  'next_to_last_sample_weight': self.next_to_last_sample_weight,
                                  'estimator_weights_': self.estimator_weights_,
                                  'estimator_errors_': self.estimator_errors_}

        return self

    def fit_more(self, X, y, n_more):
        """
        Fits additional estimators.

        Parameters
        ----------
        X, y :
            Training data, passed on to AdaBoostClassifier.fit().
        n_more : int
            Number of estimators to add on top of the remembered ones.

        Raises
        ------
        Exception
            If no estimators have been remembered from an earlier fit.
        """
        # Validate before touching any state so a failed call has no side
        # effects.
        if not self.old_estimators_:
            raise Exception('Should have already fit estimators before calling fit_more()')

        # Since AdaBoostClassifier._boost() doesn't boost the last sample
        # weight, we retrain the last estimator with its input sample weight.
        # Hence one extra estimator is fit and the old last one is dropped.
        self.n_estimators = n_more + 1
        self.old_estimators_ = self.old_estimators_[:-1]

        old_estimator_weights = self.estimator_weights_[:-1]
        old_estimator_errors = self.estimator_errors_[:-1]
        sample_weight = self.next_to_last_sample_weight
        AdaBoostClassifier.fit(self, X, y, sample_weight)

        # Append the freshly fit estimators to the remembered ones; the slice
        # above made a new list, so no previously shared list is mutated.
        self.old_estimators_.extend(self.estimators_)
        self.estimators_ = self.old_estimators_

        self.n_estimators = len(self.estimators_)
        self.estimator_weights_ = np.concatenate([old_estimator_weights, self.estimator_weights_])
        self.estimator_errors_ = np.concatenate([old_estimator_errors, self.estimator_errors_])



