层理排列的奇怪结果

from sklearn.model_selection import StratifiedShuffleSplit def stratifid(df, target, test_sz = 0.2): split = StratifiedShuffleSplit(n_splits = 1, test_size = test_sz, random_state = 42) for tr_idx, te_idx in split.split(df, df[target]): train = df.loc[tr_idx] test = df.loc[te_idx] return train, test df = pd.DataFrame(data = { 'gender' : [1, 1, 0, 1, 1, 0, 0, 0, 1, 0, ], 'age' : [13, 45, 1, 45, 15, 16, 16, 16, 15, 15], 'cholesterol' : [1, 2, 2, 1, 1, 1, 1, 1, 1, 1], 'smoke' : [0, 0, 1, 1, 7, 8, 3, 4, 4, 2]}, dtype = np.int64) df1 = df.loc[df['age'] > 13] X_train, X_test = stratifid(df1, ['gender'], 0.2) print(X_train) I expect correct stratification for data. But my output is following: gender age cholesterol smoke 0 NaN NaN NaN NaN 4 1.0 15.0 1.0 7.0 1 1.0 45.0 2.0 0.0 6 0.0 16.0 1.0 3.0 3 1.0 45.0 1.0 1.0 7 0.0 16.0 1.0 4.0 Nan values are not expected ones.... If I make stratification for whole df( when df1 = df) all are Ok. What I'm doing wrong?

1条回答

网友

1楼 · 发布于 2024-09-29 20:27:16

from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

def stratifid(df, target, test_sz = 0.2):
 split = StratifiedShuffleSplit(n_splits = 1, test_size  = test_sz, random_state = 42)
 for tr_idx, te_idx in split.split(df, df[target]):
   train = df.loc[tr_idx]
   test  = df.loc[te_idx]
 return train, test

df = pd.DataFrame(data = {
    'gender' :      [1,  1,  0, 1,  1,  0,  0,  0,  1,  0, ],
    'age' :         [13, 45, 1, 45, 15, 16, 16, 16, 15, 15],
    'cholesterol' : [1,  2,  2, 1, 1, 1, 1, 1, 1, 1],
    'smoke' :       [0,  0,  1, 1, 7, 8, 3, 4, 4, 2]},
     dtype = np.int64)

df1 = df.loc[df['age'] > 13]
df1.reset_index(inplace=True)
del df1['index']

X_train, X_test = stratifid(df1, ['gender'], 0.2)
print(X_train)

相关问题更多 >

编程相关推荐

热门问题

热门文章