如何在 Python 中根据 10 折交叉验证创建或调整训练集/测试集划分

2024-09-30 20:33:55 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试用 Python 做集成学习,但不清楚在使用 10 折交叉验证时应该如何划分训练集和测试集……有人知道在 Python 中该怎么处理吗?

另外,我想输出每一折(每次迭代)的结果以及标准差。

我在 macOS 上通过 Anaconda 的 Jupyter Notebook 使用 Python 2.7.15。

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import    AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn import metrics





# Load the dataset. 'mobileOp' is the target column; every other column is
# used as a feature.
df = pd.read_csv('/Users/arbade/Desktop/Datasets/realData.csv', encoding="utf-8")

X = df.drop(columns=['mobileOp'])
y = df['mobileOp']

seed = 42        # shared random_state so every run is reproducible
num_trees = 25   # initial n_estimators for the tree-based ensembles

# 10-fold splitter shared by every grid search and by the final cross-validation.
# shuffle=True is required for random_state to take effect: without it the
# seed is silently ignored by older scikit-learn releases and rejected with
# a ValueError by newer ones.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Hold out 20% of the rows as a final test set; the remaining 80% is what the
# grid searches cross-validate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=seed
)



# k-nearest neighbours: tune n_neighbors over 1..49 with the shared k-fold
# splitter.
knn = KNeighborsClassifier(n_neighbors=3, metric='minkowski')

params_knn = {"n_neighbors": np.arange(1, 50)}

# `iid` is deliberately not passed: the parameter was removed from
# GridSearchCV in scikit-learn 0.24, where supplying it raises TypeError.
knn_gs = GridSearchCV(knn, params_knn, cv=kfold)
knn_gs.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# the whole of X_train using the best n_neighbors, so no extra fit is needed.
knn_best = knn_gs.best_estimator_

# Predict with the tuned model so the printed labels come from the same
# estimator whose test score is reported later.
prediction = knn_best.predict(X_test)
print(prediction)


# Random forest: choose between 50 and 100 trees using the shared k-fold
# splitter, then keep the refitted winner and report the chosen setting.
params_rf = dict(n_estimators=[50, 100])

rf = RandomForestClassifier(
    max_features="sqrt",
    n_estimators=num_trees,
    random_state=seed,
)

# fit() returns the search object itself, so construction and fitting chain.
rf_gs = GridSearchCV(estimator=rf, param_grid=params_rf, cv=kfold).fit(X_train, y_train)
rf_best = rf_gs.best_estimator_

print(rf_gs.best_params_)




# AdaBoost: choose between 50 and 100 boosting rounds with the shared k-fold
# splitter.
adaBoost = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

params_adaBoost = {"n_estimators": [50, 100]}

# `iid` is deliberately not passed: the parameter was removed from
# GridSearchCV in scikit-learn 0.24, where supplying it raises TypeError.
adaBoost_gs = GridSearchCV(adaBoost, params_adaBoost, cv=kfold)
adaBoost_gs.fit(X_train, y_train)

# best_estimator_ is already refitted on X_train (refit=True is the default).
# The untuned `adaBoost` template is not fitted separately because only
# `adaBoost_best` is scored and passed to the ensemble.
adaBoost_best = adaBoost_gs.best_estimator_



# Gradient boosting: choose between 50 and 100 boosting stages with the
# shared k-fold splitter.
grBoost = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
params_grBoost = {"n_estimators": [50, 100]}

# `iid` is deliberately not passed: the parameter was removed from
# GridSearchCV in scikit-learn 0.24, where supplying it raises TypeError.
grBoost_gs = GridSearchCV(grBoost, params_grBoost, cv=kfold)
grBoost_gs.fit(X_train, y_train)

# best_estimator_ is already refitted on X_train (refit=True is the default).
# The untuned `grBoost` template is not fitted separately because only
# `grBoost_best` is scored and passed to the ensemble.
grBoost_best = grBoost_gs.best_estimator_



# Models trained with fixed hyper-parameters (no grid search).
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 15), random_state=seed)
mlp.fit(X_train, y_train)

dtc = DecisionTreeClassifier(max_depth=10, random_state=seed, criterion='entropy')
dtc.fit(X_train, y_train)

# probability=True is kept from the original configuration.
svc = SVC(gamma='scale', kernel='rbf', probability=True, random_state=seed)
svc.fit(X_train, y_train)

nb = MultinomialNB()
nb.fit(X_train, y_train)

log_reg = LogisticRegression(penalty='l1', multi_class='multinomial', solver='saga', max_iter=100, C=1e5, random_state=seed, dual=False, intercept_scaling=1, verbose=0, n_jobs=3, class_weight=None)
# The model must be fitted before it is scored below; calling score() on an
# unfitted estimator raises NotFittedError.
log_reg.fit(X_train, y_train)

# Hold-out accuracy of every individual model, in the original report order.
for model_label, fitted_model in [
    ("KNN Classifier", knn_best),
    ("Random Forest", rf_best),
    ("Logistic Regression", log_reg),
    ("SVC Classifier", svc),
    ("Naive-Bayes Classifier", nb),
    ("Desicion-Tree", dtc),
    ("Multi-Layer Perceptron", mlp),
    ("AdaBoost", adaBoost_best),
    ("GradientBoosting Classifier", grBoost_best),
]:
    print("{}: {}".format(model_label, fitted_model.score(X_test, y_test)))





# Hard-voting ensemble over all nine base models (tuned and fixed-parameter).
estimators = [
    ("knn", knn_best),
    ("rf", rf_best),
    ("log_reg", log_reg),
    ("nb", nb),
    ("svc", svc),
    ("dtc", dtc),
    ("mlp", mlp),
    ("adaBoost", adaBoost_best),
    ('grBoost', grBoost_best),
]

ensemble = VotingClassifier(estimators, voting="hard")

# Fit on the training split and evaluate once on the held-out test split.
ensemble.fit(X_train, y_train)

a = ensemble.score(X_test, y_test)
ensPred = ensemble.predict(X_test)

# Separate 10-fold cross-validation of the ensemble over the full dataset;
# `results` holds one accuracy value per fold.
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)

# Per-fold scores first, then their mean and standard deviation.
for fold_number, fold_score in enumerate(results, 1):
    print("Fold %d: %0.4f" % (fold_number, fold_score))
print("Accuracy: %0.2f (+/- %0.2f)" % (results.mean(), results.std()))
# Single format string: passing two arguments to print() emits a tuple on
# Python 2 and a stray '%' on Python 3.
print("std: %0.2f%%" % (results.std() * 100))

# Hold-out accuracy of the ensemble (score() on a classifier is accuracy, so
# a separate accuracy_score call on the same predictions is not needed).
print("Ensemble Score: {}".format(a))
print('Average Score:' + repr(results.mean() * 100) + '%')
print(classification_report(y_test, ensPred))

还有一些预期结果:

KNN:[](Each iteration for 10 fold,Accuracy and std)
Random-Forest:[](Each iteration for 10 fold,Accuracy and std)
.
.
.
Multi-Layer Perceptron:[](Each iteration for 10 fold,Accuracy and std)
Ensemble Score:[](Each iteration for 10 fold,Accuracy and std)

Tags: fromtestimportgstrainsklearnfitseed