为什么我得到了近乎完美的测试精度?

2024-10-05 10:13:36 发布

您现在位置:Python中文网/ 问答频道 /正文

我使用这个数据集(dataset)来预测员工绩效(employee performance),并尝试了不同的 ML 算法,例如 DecisionTreeClassifier、CategoricalNB、LogisticRegression 和 GaussianNB。数据集的基本结构如下:

df.head(5)

   Age  DailyRate  DistanceFromHome  EnvironmentSatisfaction  HourlyRate  ...  EducationField  Department  BusinessTravel  OverTime  Over18
0   41       1102                 1                        2          94  ...               1           2               2         1       0
1   49        279                 8                        3          61  ...               1           1               1         0       0
2   37       1373                 2                        4          92  ...               4           1               2         1       0
3   33       1392                 3                        4          56  ...               1           1               1         1       0
4   27        591                 2                        1          40  ...               3           1               2         0       0

当我尝试使用

model.score(X_test, y_test)

它给出了接近 100% 的准确率(99.98…)。

这可能是什么原因?

这是我的代码:

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing

def readData(path):
    """Load the CSV at ``path`` and return preprocessed features plus target.

    The ``PerformanceRating`` column is used as the target. It and a fixed
    set of other columns are removed from the feature frame before the
    features and target are handed to ``pre_processing``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        Whatever ``pre_processing`` returns for the selected features and
        the ``PerformanceRating`` target.
    """
    # Columns that are not used as features (the target itself included).
    excluded_columns = [
        'PerformanceRating',
        'Attrition',
        'Education',
        'EmployeeCount',
        'EmployeeNumber',
        'StockOptionLevel',
        'WorkLifeBalance',
    ]

    frame = pd.read_csv(path)
    labels = frame['PerformanceRating']
    # ``drop`` returns a new frame, so ``frame`` itself keeps every column.
    features = frame.drop(excluded_columns, axis='columns')

    return pre_processing(features, labels)


def pre_processing(inputs_, target_):
    """Label-encode the categorical columns and min-max scale all features.

    Each categorical column is replaced by its ``LabelEncoder`` integer
    codes. The encoded columns end up after the remaining columns, in the
    order listed in ``categorical_columns``. The whole frame is then scaled
    with ``MinMaxScaler``.

    The caller's DataFrame is not modified; all work happens on a copy.

    Args:
        inputs_: Feature DataFrame containing every column named in
            ``categorical_columns``.
        target_: Target values; returned unchanged.

    Returns:
        A two-element list ``[scaled_features, target_]`` where
        ``scaled_features`` is the result of ``MinMaxScaler.fit_transform``.

    Raises:
        KeyError: If one of the categorical columns is missing.
    """
    categorical_columns = [
        'MaritalStatus',
        'JobRole',
        'Gender',
        'EducationField',
        'Department',
        'BusinessTravel',
        'OverTime',
        'Over18',
    ]

    # Copy first: the previous implementation aliased the argument and so
    # added/dropped/renamed columns on the caller's frame as a side effect.
    inputs = inputs_.copy()

    # Each column gets its own encoder, fitted on that column only.
    encoded = {
        column: LabelEncoder().fit_transform(inputs[column])
        for column in categorical_columns
    }

    # Remove the raw columns, then append the encoded ones under the same
    # names so they sit at the end of the frame in a fixed order.
    inputs = inputs.drop(categorical_columns, axis='columns')
    for column, codes in encoded.items():
        inputs[column] = codes

    # NOTE: the scaler is fitted on every row passed in. If the result is
    # split into train/test afterwards, the test rows have already
    # influenced the scaling statistics.
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_inputs = min_max_scaler.fit_transform(inputs)

    print(inputs.head(5))
    return [scaled_inputs, target_]



def decision_tree_classifier(inputs_, target_):
    """Fit a decision tree on a random split and print its test score.

    Args:
        inputs_: Feature matrix handed to ``train_test_split``.
        target_: Labels aligned with ``inputs_``.

    20% of the samples are held out for scoring. No ``random_state`` is
    passed to the split or the tree.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        inputs_, target_, test_size=0.2
    )

    tree = DecisionTreeClassifier()
    tree.fit(features_train, labels_train)

    test_score = tree.score(features_test, labels_test)
    print(test_score)


def naive_bayes_gaussian(inputs_, target_):
    """Fit ``GaussianNB`` on a random split and print its test score.

    Args:
        inputs_: Feature matrix handed to ``train_test_split``.
        target_: Labels aligned with ``inputs_``.

    20% of the samples are held out for scoring; the split is made without
    a ``random_state``.
    """
    split = train_test_split(inputs_, target_, test_size=0.2)
    features_train, features_test, labels_train, labels_test = split

    classifier = GaussianNB()
    classifier.fit(features_train, labels_train)

    print(classifier.score(features_test, labels_test))


def naive_bayes_categorical(inputs_, target_):
    """Fit ``CategoricalNB`` on a random split and print its test score.

    Args:
        inputs_: Feature matrix handed to ``train_test_split``.
        target_: Labels aligned with ``inputs_``.

    20% of the samples are held out for scoring; the split is made without
    a ``random_state``.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        inputs_,
        target_,
        test_size=0.2,
    )

    nb_model = CategoricalNB()
    nb_model.fit(train_x, train_y)

    held_out_score = nb_model.score(test_x, test_y)
    print(held_out_score)


def logistic_regression(inputs_, target_):
    """Fit a one-vs-rest logistic regression and print its test score.

    Args:
        inputs_: Feature matrix handed to ``train_test_split``.
        target_: Labels aligned with ``inputs_``.

    20% of the samples are held out for scoring; the split is made without
    a ``random_state``.
    """
    x_fit, x_eval, y_fit, y_eval = train_test_split(
        inputs_, target_, test_size=0.2
    )

    # NOTE(review): recent scikit-learn releases appear to deprecate the
    # ``multi_class`` argument -- confirm against the installed version.
    estimator = LogisticRegression(multi_class="ovr")
    estimator.fit(x_fit, y_fit)

    print(estimator.score(x_eval, y_eval))


if __name__ == "__main__":
    # readData hands back a two-item sequence: features first, target second.
    inputs, target = readData("performance.csv")

    print(inputs)

    # Only the Gaussian naive Bayes experiment is run from the script entry.
    naive_bayes_gaussian(inputs, target)

Tags: from, test, import, target, model, transform, train, sklearn
1条回答
网友
1楼 · 发布于 2024-10-05 10:13:36

您处理的数据集(IBM HR Analytics 数据集)可能是高度不平衡的:当某一类别占绝大多数时,模型即使几乎只预测该类别,也能得到很高的 accuracy。您需要使用分层(stratified)方法划分训练集和测试集,例如 train_test_split(..., stratify=target);也可以使用其他度量来代替 accuracy,例如 F1、recall、precision,以便更好地了解模型性能。

相关问题 更多 >

    热门问题