The accuracy of the naive Bayes classifier changes because the training and test datasets change with k-fold cross-validation

Posted on 2024-05-02 11:54:06


I am trying to use the naive Bayes classifier code from here. I am using 5-fold cross-validation on my dataset. The problem is that the test and training sets change on every fold, so the accuracy changes on every run, but I need a result with fixed accuracy. I tried to get the result with a small sample dataset. My Jupyter code is as follows:

import numpy as np
from random import randrange
import csv
import math
import codecs

# Returns the mean of numbers
def mean(numbers):
    
    return np.mean(numbers)

#Returns the std_deviation of numbers
def stdev(numbers):

    return np.std(numbers)  

#Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
   
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

#Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, n_folds):
   
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

#Split training set by class value
def separate_by_class(dataset):

    separated = {}
    for i in range(len(dataset)):
        row = dataset[i]
        if row[-1] not in separated:
            separated[row[-1]] = []
        separated[row[-1]].append(row)
    return separated

#Find the mean and standard deviation of each feature in dataset
def model(dataset):
   
    models = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    models.pop() #Remove last entry because it is class value.
    return models

#find the mean and standard deviation of each feature in dataset by their class
def model_by_class(dataset):
    
    separated = separate_by_class(dataset)
    class_models = {}
    for (classValue, instances) in separated.items():
        class_models[classValue] = model(instances)
    return class_models

#Calculate probability using gaussian density function
def calculate_pdf(x, mean, stdev):
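    # Gaussian PDF: f(x) = exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev).
    # A stdev of 0 means the feature is constant for this class, so treat it as a point mass.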
    
    if stdev == 0.0:
        if x == mean:
            return 1.0
        else:
            return 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return 1 / (math.sqrt(2 * math.pi) * stdev) * exponent

#Calculate the class probability for input sample. Combine probability of each feature
def calculate_class_probabilities(models, input_vector):

    probabilities = {}
    for (classValue, classModels) in models.items():
        # Naive independence assumption: the class score is the
        # product of the per-feature Gaussian densities.
        probabilities[classValue] = 1
        for i in range(len(classModels)):
            (mean, stdev) = classModels[i]
            x = input_vector[i]
            probabilities[classValue] *= calculate_pdf(x, mean, stdev)
    return probabilities

#Compare probability for each class. Return the class label which has max probability.
def predict(models, inputVector):
    
    probabilities = calculate_class_probabilities(models, inputVector)
    (bestLabel, bestProb) = (None, -1)
    for (classValue, probability) in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

#Get class label for each value in test set.
def getPredictions(models, testSet):

    predictions = []
    for i in range(len(testSet)):
        result = predict(models, testSet[i])
        predictions.append(result)
    return predictions

#Create a naive bayes model. Then test the model and returns the testing result.
def naive_bayes(train, test):
    
    summaries = model_by_class(train)
    predictions = getPredictions(summaries, test)
    return predictions

# load and prepare data for result
dataset =[[1, 20, 1],
          [2, 21, 0],
          [3, 22, 1],
          [4, 22, 0],
          [5, 20, 0],
          [6, 20, 1],
          [7, 21, 0],
          [8, 22, 1],
          [9, 22, 0],
          [10, 20, 1]]
      
n_folds = 5
print ("---------- Gaussian Naive Bayes ---------------")
accuracy_naive = evaluate_algorithm(dataset, naive_bayes, n_folds)
print ("Naive Bayes Classification")
print ('Accuracy in each fold: %s' % accuracy_naive)
print ('Average Accuracy: %f' % (sum(accuracy_naive) / len(accuracy_naive)))

I tested it with the sample data. I think the problem is in this part:

# Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split



#Test splitting data
dataset = [[1, 20, 1],
           [2, 21, 0],
           [3, 22, 1],
           [4, 22, 0],
           [5, 20, 0],
           [6, 20, 1],
           [7, 21, 0],
           [8, 22, 1],
           [9, 22, 0],
           [10, 20, 1]
           ]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split

Thanks in advance.


2 Answers

Seed the random number generator before calling randrange, so that the same split is produced on every run.

You can change your code as follows:

import random

# Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    random.seed(0)
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split



#Test splitting data
dataset = [[1, 20, 1],
           [2, 21, 0],
           [3, 22, 1],
           [4, 22, 0],
           [5, 20, 0],
           [6, 20, 1],
           [7, 21, 0],
           [8, 22, 1],
           [9, 22, 0],
           [10, 20, 1]
           ]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split 
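A small variation on this (my own suggestion, not part of the answer above): pass the seed as a parameter and use a local random.Random instance. That way the global random generator is left untouched, and callers can still request a different split when they want one.

import random

# Split dataset into k folds with a caller-supplied seed.
def cross_validation_split(dataset, n_folds, seed=0):
    rng = random.Random(seed)  # local generator; does not reseed the global one
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = rng.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split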

The answer given by @Amesh Jayaweera is correct, but I want to point out that sklearn has a predefined function for this, as follows:

from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

This is a better approach with an elegant implementation, and it has the added advantage of stratified folds. Here random_state is the seed; note that it only takes effect when shuffle=True. You can check its implementation online.
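For completeness, a minimal sketch of how this splitter could be applied to the sample dataset from the question (the names X, y and the printing are my own illustration, not part of the original answer):

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Sample dataset from the question; the last column is the class label.
data = np.array([[1, 20, 1], [2, 21, 0], [3, 22, 1], [4, 22, 0], [5, 20, 0],
                 [6, 20, 1], [7, 21, 0], [8, 22, 1], [9, 22, 0], [10, 20, 1]])
X, y = data[:, :-1], data[:, -1]

# shuffle=True is required for random_state to have any effect;
# with both set, the folds are identical on every run.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
    print("Fold %d: train=%s, test=%s" % (fold_no, train_idx, test_idx))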
