我使用这个数据集(dataset)来预测员工绩效(employee performance),并尝试了不同的ML算法,例如DecisionTreeClassifier、CategoricalNB、LogisticRegression和GaussianNB。这就是数据集的基本结构:
df.head(5)
Age DailyRate DistanceFromHome EnvironmentSatisfaction HourlyRate ... EducationField Department BusinessTravel OverTime Over18
0 41 1102 1 2 94 ... 1 2 2 1 0
1 49 279 8 3 61 ... 1 1 1 0 0
2 37 1373 2 4 92 ... 4 1 2 1 0
3 33 1392 3 4 56 ... 1 1 1 1 0
4 27 591 2 1 40 ... 3 1 2 0 0
当我尝试使用 model.score(X_test, y_test) 时,它给出了几乎100%的准确度(99.98…)。
这可能是什么原因?
这是我的代码:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing
def readData(path):
    """Load the CSV at ``path`` and hand features and labels to pre_processing.

    The ``PerformanceRating`` column is used as the target. It is removed
    from the feature frame together with the other columns listed in
    ``excluded`` before the features are pre-processed.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        Whatever ``pre_processing`` returns for the selected features and
        the ``PerformanceRating`` column.
    """
    # Columns that must not reach the model: the target itself plus the
    # fields the author chose to leave out.
    excluded = [
        'PerformanceRating',
        'Attrition', 'Education', 'EmployeeCount',
        'EmployeeNumber',
        'StockOptionLevel',
        'WorkLifeBalance',
    ]
    frame = pd.read_csv(path)
    labels = frame['PerformanceRating']
    features = frame.drop(excluded, axis='columns')
    return pre_processing(features, labels)
def pre_processing(inputs_, target_):
    """Label-encode the categorical columns and min-max scale all features.

    Each column named in ``categorical_columns`` is replaced by the integer
    codes produced by its own ``LabelEncoder``. The encoded columns are
    placed after the untouched columns, in the order listed, and the whole
    frame is then scaled with ``MinMaxScaler``.

    The work is done on a new frame; ``inputs_`` itself is left unmodified.

    Args:
        inputs_: DataFrame of features. It must contain every column named
            in ``categorical_columns``; all remaining columns are passed to
            the scaler as they are.
        target_: Labels belonging to the rows of ``inputs_``. Returned
            unchanged.

    Returns:
        A two-element list ``[scaled_features, target_]`` where
        ``scaled_features`` is the array returned by
        ``MinMaxScaler.fit_transform``.
    """
    categorical_columns = [
        'MaritalStatus', 'JobRole', 'Gender', 'EducationField',
        'Department', 'BusinessTravel', 'OverTime', 'Over18',
    ]

    # One independent encoder per column; dict order fixes the column order
    # of the encoded block appended below.
    encoded = {
        column: LabelEncoder().fit_transform(inputs_[column])
        for column in categorical_columns
    }

    # drop() returns a new frame, so the caller's DataFrame is never touched.
    inputs = inputs_.drop(categorical_columns, axis='columns').assign(**encoded)

    # NOTE(review): the scaler is fitted on the complete data set, before any
    # train/test split is made by the model functions, so held-out rows
    # influence the scaling. Consider fitting on the training split only.
    min_max_scaler = preprocessing.MinMaxScaler()
    X_train_minmax = min_max_scaler.fit_transform(inputs)

    print(inputs.head(5))
    return [X_train_minmax, target_]
def decision_tree_classifier(inputs_, target_):
    """Fit a DecisionTreeClassifier on an 80/20 split and print its test score.

    The split is stratified on ``target_`` so the training and test
    partitions keep the class proportions of the full data set. With an
    imbalanced target, an unstratified split can leave the test partition
    with very few minority-class rows, which makes the score unreliable.

    Args:
        inputs_: Feature matrix, one row per sample.
        target_: Class label for each row of ``inputs_``.

    Returns:
        None. The value of ``model.score`` on the held-out 20% is printed.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the labels
            cannot be stratified (for example a class with a single row).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_
    )
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    # Accuracy alone can look excellent on imbalanced labels; compare with
    # precision / recall / F1 before trusting this number.
    print(model.score(X_test, y_test))
def naive_bayes_gaussian(inputs_, target_):
    """Fit a GaussianNB model on an 80/20 split and print its test score.

    The split is stratified on ``target_`` so the training and test
    partitions keep the class proportions of the full data set. With an
    imbalanced target, an unstratified split can leave the test partition
    with very few minority-class rows, which makes the score unreliable.

    Args:
        inputs_: Feature matrix, one row per sample.
        target_: Class label for each row of ``inputs_``.

    Returns:
        None. The value of ``model.score`` on the held-out 20% is printed.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the labels
            cannot be stratified (for example a class with a single row).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_
    )
    model = GaussianNB()
    model.fit(X_train, y_train)
    # Accuracy alone can look excellent on imbalanced labels; compare with
    # precision / recall / F1 before trusting this number.
    print(model.score(X_test, y_test))
def naive_bayes_categorical(inputs_, target_):
    """Fit a CategoricalNB model on an 80/20 split and print its test score.

    The split is stratified on ``target_`` so the training and test
    partitions keep the class proportions of the full data set. With an
    imbalanced target, an unstratified split can leave the test partition
    with very few minority-class rows, which makes the score unreliable.

    NOTE(review): CategoricalNB is documented as expecting every feature to
    be a non-negative integer category code. The pipeline in this file
    produces min-max-scaled floats, so confirm the caller passes suitably
    encoded features before relying on this model.

    Args:
        inputs_: Feature matrix, one row per sample.
        target_: Class label for each row of ``inputs_``.

    Returns:
        None. The value of ``model.score`` on the held-out 20% is printed.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the labels
            cannot be stratified (for example a class with a single row).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_
    )
    model = CategoricalNB()
    model.fit(X_train, y_train)
    # Accuracy alone can look excellent on imbalanced labels; compare with
    # precision / recall / F1 before trusting this number.
    print(model.score(X_test, y_test))
def logistic_regression(inputs_, target_):
    """Fit a one-vs-rest LogisticRegression on an 80/20 split and print its score.

    The split is stratified on ``target_`` so the training and test
    partitions keep the class proportions of the full data set. With an
    imbalanced target, an unstratified split can leave the test partition
    with very few minority-class rows, which makes the score unreliable.

    Args:
        inputs_: Feature matrix, one row per sample.
        target_: Class label for each row of ``inputs_``.

    Returns:
        None. The value of ``model.score`` on the held-out 20% is printed.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the labels
            cannot be stratified (for example a class with a single row).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_
    )
    # NOTE(review): recent scikit-learn releases appear to deprecate the
    # ``multi_class`` argument -- confirm against the installed version.
    model = LogisticRegression(multi_class="ovr")
    model.fit(X_train, y_train)
    # Accuracy alone can look excellent on imbalanced labels; compare with
    # precision / recall / F1 before trusting this number.
    print(model.score(X_test, y_test))
if __name__ == "__main__":
    # readData yields a two-element list: the scaled feature array first,
    # the target column second.
    inputs, target = readData("performance.csv")
    print(inputs)
    naive_bayes_gaussian(inputs, target)
您处理的数据集(IBM HR Analytics数据集)可能是高度不平衡的。您需要使用分层(stratified)方法来划分训练集和测试集;此外,也许您可以使用其他度量来代替accuracy,例如F1、recall或precision,来了解您的模型性能。
相关问题 更多 >
编程相关推荐