我正在关注一篇关于Python中自然语言处理的tutorial。我正在以一种我觉得更容易阅读的方式构建代码
class Datasets(object):
labeled_training_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\labeledTrainData.tsv\labeledTrainData.tsv",
header=0,
delimiter="\t",
quoting=3)
test_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\testData.tsv\testData.tsv",
header=0,
delimiter="\t",
quoting=3)
unlabeled_training_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\unlabeledTrainData.tsv\unlabeledTrainData.tsv",
header=0,
delimiter="\t",
quoting=3)
class TextCleaner(object):
@classmethod
def convert_to_word_list(html_text, remove_stopwords = False):
cleaned_text = BeautifulSoup(html_text).get_text()
cleaned_text = re.sub("[^a-zA-Z]", " ", cleaned_text)
all_words = cleaned_text.lower().split()
if remove_stopwords:
stop_words = set(stopwords.words("english"))
remaining_words = [word for word in all_words if not word in stop_words]
return(all_words)
@classmethod
def __splitIntoSentences(text, tokenizer, remove_stopwords = False):
sentences_in_text = tokenizer.tokenize(text.strip())
non_empty_sentences = []
for sentence in sentences_in_text:
if len(sentence) > 0:
non_empty_sentences.append(__convertToWordList(sentence, remove_stopwords))
return non_empty_sentences
@classmethod
def get_all_sentences_from_training_sets(tokenizer):
all_sentences = []
print "Parsing sentences from the labeled training set..."
for review in Datasets.labeled_training_data["review"]:
all_sentences.append(TextCleaner.__splitIntoSentences(review, tokenizer))
print "Parsing sentences from the unlabeled training set..."
for review in Datasets.unlabeled_training_data["review"]:
all_sentences.append(TextCleaner.__splitIntoSentences(review, tokenizer))
print "There are %d sentences in our training sets." % len(all_sentences)
return all_sentences
当我在REPL中运行这两行时,问题出现了:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
all_sentences = TextCleaner.get_all_sentences_from_training_sets(tokenizer)
我收到以下错误消息:
TypeError: getAllSentencesFromTrainingData() takes exactly 1 argument (2 given)
我已经在documentation中读到了关于nltk.data.load()
的内容,但是对于导致错误消息的原因,我仍然一无所知
救命啊
classmethod
的第一个参数应该是cls
,它将由调用方法的特定类填充(编辑:由Python运行时,而不是由您)不过,从pythonic的角度来看,这些函数实际上可能是模块级函数,因为它们没有以任何方式使用类的机制
相关问题 更多 >
编程相关推荐