xgboost中的Poisson回归在低频下失败

import numpy as np
import pandas as pd
import xgboost as xgb


def get_preds(mult):
    """Fit a Poisson booster on a four-row toy dataset and return rescaled fits.

    The labels are the frequencies 1, 2, 3, 4 multiplied by ``mult``.  No base
    margin / offset is set here; the result table below shows that for small
    ``mult`` the fitted values no longer distinguish the four rows.

    Returns ``bst.predict(dmat) / mult``, i.e. the fitted frequencies with the
    scaling reversed.
    """
    # generate toy dataset for illustration
    # 4 observations with linearly increasing frequencies
    # the frequencies are scaled by `mult`
    dmat = xgb.DMatrix(data=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
                       label=[i*mult for i in [1, 2, 3, 4]],
                       weight=[1000, 1000, 1000, 1000])

    # train a poisson booster on the toy data
    # (early stopping is evaluated on the training data itself)
    bst = xgb.train(
        params={"objective": "count:poisson"},
        dtrain=dmat,
        num_boost_round=100000,
        early_stopping_rounds=5,
        evals=[(dmat, "train")],
        verbose_eval=False)

    # return fitted frequencies after reversing scaling
    return bst.predict(dmat)/mult


# test multipliers in the range [10**(-8), 10**0]
# display fitted frequencies
mults = [10**i for i in range(-8, 1)]
df = pd.DataFrame(np.round(np.vstack([get_preds(m) for m in mults]), 0))
df.index = mults
df.columns = ["(0, 0)", "(0, 1)", "(1, 0)", "(1, 1)"]
df

# --- result ---
#               (0, 0)   (0, 1)   (1, 0)   (1, 1)
#1.000000e-08  11598.0  11598.0  11598.0  11598.0
#1.000000e-07   1161.0   1161.0   1161.0   1161.0
#1.000000e-06    118.0    118.0    118.0    118.0
#1.000000e-05     12.0     12.0     12.0     12.0
#1.000000e-04      2.0      2.0      3.0      3.0
#1.000000e-03      1.0      2.0      3.0      4.0
#1.000000e-02      1.0      2.0      3.0      4.0
#1.000000e-01      1.0      2.0      3.0      4.0
#1.000000e+00      1.0      2.0      3.0      4.0

1条回答

网友

1楼 · 发布于 2024-09-28 22:23:15

经过一番挖掘，我找到了解决办法，在这里记录下来，以防其他人遇到相同的问题。原来我需要添加一个偏移项（offset），其值等于平均频率的自然对数。如果这一点还不够直观：这是因为初始预测从 0.5 的频率开始，需要许多轮提升迭代才能把预测重新调整到平均频率附近。

请参阅下面的代码，它是对玩具示例的更新。正如我在最初的问题中所提到的那样，现在在较低的尺度下，预测值接近平均频率（2.5）。

import numpy as np
import pandas as pd
import xgboost as xgb

def get_preds(mult):
    """Fit a Poisson booster on a four-row toy dataset and return rescaled fits.

    The labels are the frequencies 1, 2, 3, 4 multiplied by ``mult``.  Every
    row is given a base margin equal to the natural log of the mean label
    before training.

    Returns ``booster.predict(...) / mult``, i.e. the fitted frequencies with
    the scaling reversed.
    """
    # toy data: two binary features, linearly increasing scaled frequencies
    features = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    scaled_freqs = [base * mult for base in (1, 2, 3, 4)]
    toy = xgb.DMatrix(data=features,
                      label=scaled_freqs,
                      weight=[1000] * 4)

    ## offset term: log of the mean (scaled) frequency, one entry per row
    log_mean = np.log(np.mean(scaled_freqs))
    toy.set_base_margin(np.full(4, log_mean))

    # poisson booster, early-stopped on the training data itself
    train_kwargs = dict(
        params={"objective": "count:poisson"},
        dtrain=toy,
        num_boost_round=100000,
        early_stopping_rounds=5,
        evals=[(toy, "train")],
        verbose_eval=False,
    )
    booster = xgb.train(**train_kwargs)

    # undo the scaling so the results are comparable across multipliers
    fitted = booster.predict(toy)
    return fitted / mult

# Sweep the multipliers 10**(-8) ... 10**0 and tabulate the fitted
# frequencies: one row per multiplier, one column per feature combination.
mults = [10**i for i in range(-8, 1)]
## one decimal place is enough to show the fits approaching 2.5
df = pd.DataFrame(
    np.round(np.vstack(list(map(get_preds, mults))), 1),
    index=mults,
    columns=["(0, 0)", "(0, 1)", "(1, 0)", "(1, 1)"],
)
df

#  - result  -
#              (0, 0)  (0, 1)  (1, 0)  (1, 1)
#1.000000e-08     2.5     2.5     2.5     2.5
#1.000000e-07     2.5     2.5     2.5     2.5
#1.000000e-06     2.5     2.5     2.5     2.5
#1.000000e-05     2.5     2.5     2.5     2.5
#1.000000e-04     2.4     2.5     2.5     2.6
#1.000000e-03     1.0     2.0     3.0     4.0
#1.000000e-02     1.0     2.0     3.0     4.0
#1.000000e-01     1.0     2.0     3.0     4.0
#1.000000e+00     1.0     2.0     3.0     4.0

相关问题更多 >

编程相关推荐

热门问题

热门文章