MaxEnt分类器每次NLTK都返回相同的概率

2024-09-24 02:19:52 发布

您现在位置:Python中文网/ 问答频道 /正文

我尝试使用NLTK库中的maxent分类器。我有一个肯定词和否定词的列表,并且我训练了分类器。问题是,当我用一个句子测试分类器时,两个类的分类概率总是相同的。这是代码——

import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator

from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews


def getBestWords(posWords, negWords):
    """Rank words by chi-squared association with their sentiment label.

    Counts each (lowercased) word's overall frequency and its per-label
    frequency, scores every word with chi_sq against both labels, and
    returns the 2500 highest-scoring words as a set.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for raw in posWords:
        token = raw.lower()
        word_fd[token] += 1
        label_word_fd['pos'][token] += 1

    for raw in negWords:
        token = raw.lower()
        word_fd[token] += 1
        label_word_fd['neg'][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # A word's score is the sum of its chi-squared association with
    # the positive label and with the negative label.
    word_scores = {
        word: (BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                          (freq, pos_word_count), total_word_count)
               + BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                            (freq, neg_word_count), total_word_count))
        for word, freq in word_fd.items()
    }

    ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    return {word for word, _score in ranked[:2500]}

def best_word_feats(words, bestwords):
    """Build a presence-feature dict from *words*, keeping only entries
    that appear in *bestwords*.

    Args:
        words: iterable of tokens.
        bestwords: collection (ideally a set for O(1) lookup) of tokens
            allowed as features.

    Returns:
        dict mapping each retained word to True (NLTK featureset style).
    """
    # Dict comprehension instead of dict([...]) — same result, clearer idiom.
    return {word: True for word in words if word in bestwords}

def word_feats(words):
    """Build an NLTK-style presence featureset: every word maps to True.

    Args:
        words: iterable of tokens (duplicates collapse to one key).

    Returns:
        dict mapping each distinct word to True.
    """
    # dict.fromkeys does the same as the original dict([(w, True) ...])
    # in one C-level call.
    return dict.fromkeys(words, True)

def best_bigram_word_feats(words, posWords, negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Combine bigram and best-unigram presence features for *words*.

    The top *n* bigrams of *words* (ranked by *score_fn*) are merged with
    the unigram features restricted to the chi-squared best words computed
    from *posWords*/*negWords*.
    """
    finder = BigramCollocationFinder.from_words(words)
    feats = {bigram: True for bigram in finder.nbest(score_fn, n)}
    # Recomputes the best-word set on every call; merged on top of bigrams.
    feats.update(best_word_feats(words, getBestWords(posWords, negWords)))
    return feats

# ---- Load sentiment word lists, train a MaxEnt model, classify a sentence ----
# Each CSV is expected to hold one sentiment word per row (first column).
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
    spamreader = csv.reader(csvfile)
    posWords = list(spamreader)

with open('../data/finalSentiNegWords.csv','r') as csvfile:
    spamreader = csv.reader(csvfile)
    negWords = list(spamreader)

# Keep only the first column of every CSV row.
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]

# NOTE(review): this result is unused below — best_bigram_word_feats
# recomputes the best-word set internally on each call.
bestwords = getBestWords(posWords,negWords)

# NOTE(review): each of these lists holds exactly ONE (featureset, label)
# pair — the entire positive vocabulary collapses into a single training
# instance, and likewise for the negative side. With only two training
# examples, any test sentence whose words never appear as feature keys
# contributes no active features, so prob_classify returns the same
# class probabilities regardless of the sentence — which matches the
# symptom described in the question. Each word (or document) should be
# its own (featureset, label) instance instead. TODO confirm.
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]


trainfeats = negfeats + posfeats

# ALGORITHMS[0] is presumably 'GIS' — verify against the installed NLTK.
# Training is capped at 5 iterations.
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
# Naive whitespace tokenization: punctuation stays attached to the words,
# so e.g. "it." will not match a trained feature "it".
l = sentence.split(' ')
print(l)
print(word_feats(l))
# prob_classify is invoked twice on the same featureset (once per label).
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))

结果是-

(原文此处为程序输出,抓取时丢失。)

总体分类似乎运行良好,但我不明白概率是如何计算的,以及为什么即使我改变了测试语句,它们也总是相同的。

感谢任何快速的帮助。

谢谢。


Tags: infromposimportforcountlabelword
1条回答
网友
1楼 · 发布于 2024-09-24 02:19:52

代码太多了!注意,并不是所有的单词都在你的语料库里(如果这个名字没有误导的话)。

相关问题 更多 >