我有一个带有调查数据的CSV文件,我希望对其进行情绪分析
我使用朴素贝叶斯来显示信息量最大的特性,但输出没有显示有意义的洞察力。它输出不相关的单词,如level
或of
,因此我尝试手动创建一个要删除的停止单词列表,但我认为它不能正常工作,因为它们仍然存在。这是我的代码:
import csv
from collections import Counter
import nltk
from nltk.corpus import stopwords
with open('/Users/Alessandra/Desktop/Dissertation Data/Survey Coding Inst.csv', 'r') as f:
reader = csv.reader(f, delimiter='\t')
alist = []
iterreader = iter(reader)
next(iterreader)
c = Counter()
for row in iterreader:
clean_rows = row[0].replace(",", " ").rsplit()
clean_symbols = row[0].replace("-", "").rsplit()
remove_words = ['of', 'Level', 'study', 'How', 'many', 'SC', '2.', '1.', '3.', '4.', '5.', '6.', '7.', '8.',
'9.',
'10.', '11.', '12.', '13.', '14.', '15.', 'Gender', 'inconvenience', 'times', 'Agree',
'Experience', 'Interrupted', 'Workflow', 'Unable', 'Yes', 'No', 'Statement', 'Safety',
'non-UCL', 'people', 'guards', 'Stronglee', 'Disagree', 'Neutral', 'Somewhat', 'on', 'if',
'too', '-', 'i', '1', '2']
# alist.append(clean_rows)
# alist.append(clean_symbols)
c.update(clean_rows)
c.update(clean_symbols)
alist.append(c)
word_count = Counter(c)
mostWcommon = word_count.most_common()
for i in alist:
if i in remove_words:
mostWcommon.remove(i)
print(mostWcommon)
all_words = nltk.FreqDist(w.lower() for w in alist[0])
word_features = list(all_words)[:100]
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
removed_stop_words = []
for review in corpus:
removed_stop_words.append(' '.join([word for word in review[0].split() if word not in english_stop_words]))
return removed_stop_words
no_stop_words = remove_stop_words(mostWcommon)
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains {}'.format(word)] = (word in document_words)
return features
featuresets = [(document_features(d), c) for (d, c) in mostWcommon]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(5)
OUTPUT:
Most Informative Features
contains i = True 3 : 2 = 1.6 : 1.0
contains 1 = True 1 : 3 = 1.5 : 1.0
contains i = False 2 : 3 = 1.3 : 1.0
contains 2 = True 1 : 3 = 1.2 : 1.0
contains - = True 2 : 1 = 1.2 : 1.0
contains 1 = False 2 : 1 = 1.2 : 1.0
contains 2 = False 2 : 1 = 1.1 : 1.0
contains - = False 1 : 3 = 1.0 : 1.0
contains 5. = False 1 : 4 = 1.0 : 1.0
contains disagree = False 1 : 4 = 1.0 : 1.0
数据如下所示:
('Yes', 194), ('No', 173), ('agree', 61), ('Agree', 57), ('to', 48), ('UG', 47), ('Strongly', 38), ('and', 36), ('unlikely', 36), ('Female', 34), ('-', 34),....)
正如您所看到的,即使是最常见的情况,也不会拾取手动删除,因此显示的数据不太有意义。。。如有任何建议,将不胜感激
目前没有回答
相关问题 更多 >
编程相关推荐