import nltk
from nltk import pos_tag
from nltk import tokenize
def extract_phrases(my_tree, phrase):
    """Collect every subtree of *my_tree* whose label equals *phrase*.

    The tree is walked depth-first: the node itself is checked first, then
    each child subtree from left to right, so results come back in pre-order.
    Matches are returned as deep copies, so callers may modify them without
    affecting *my_tree*.

    Args:
        my_tree: An ``nltk.Tree`` (or a subclass of it) to search.
        phrase: The node label to look for, e.g. ``'NP'``.

    Returns:
        A list of deep-copied matching subtrees; empty when nothing matches.
    """
    my_phrases = []
    if my_tree.label() == phrase:
        my_phrases.append(my_tree.copy(deep=True))
    for child in my_tree:
        # isinstance rather than an exact type comparison, so subclasses of
        # nltk.Tree are descended into as well; non-tree children (leaves)
        # are skipped.
        if isinstance(child, nltk.Tree):
            my_phrases.extend(extract_phrases(child, phrase))
    return my_phrases
def main():
    """Chunk each sample sentence and print the noun phrases found in it.

    Every sentence is tokenized and POS-tagged, then parsed with a regular
    expression chunk grammar that defines ``NP`` chunks. For each ``NP``
    subtree the tree itself is printed, followed by its words joined with
    underscores.
    """
    sentences = ["My favorite game is call of duty"]
    grammar = "NP: {<DT>?<JJ>*<NN>|<NNP>*}"
    cp = nltk.RegexpParser(grammar)
    for text in sentences:
        tagged = pos_tag(tokenize.word_tokenize(text))
        tree = cp.parse(tagged)
        # Single-argument print(...) calls behave the same whether print is
        # a statement (Python 2) or a function (Python 3).
        print("\nNoun phrases:")
        for noun_phrase in extract_phrases(tree, 'NP'):
            # Each leaf is indexed with [0] to take its word part.
            joined = "_".join([leaf[0] for leaf in noun_phrase.leaves()])
            print("%s %s" % (noun_phrase, joined))


if __name__ == "__main__":
    main()
当然,这对提问者(OP)来说已经太晚了,但我想还是应该把这个答案留给其他人:
听起来你真正想问的可能是:如何确保像“call of duty”(使命召唤)这样的复合短语被组合在一起,成为一个词元(token)?
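可以使用 nltk 的多词表达式分词器(MWETokenizer),如下所示:

    string = 'My favorite game is call of duty'
    tokenized_string = nltk.word_tokenize(string)
    mwe = [('call', 'of', 'duty')]
    mwe_tokenizer = nltk.tokenize.MWETokenizer(mwe, separator=' ')
    tokenized_string = mwe_tokenizer.tokenize(tokenized_string)

其中 mwe 表示多词表达式(multi-word expression)。tokenized_string 的值将是 ['My', 'favorite', 'game', 'is', 'call of duty']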
我想你想要的是关键词提取。你可以这样做:先用词性标签标记每个词,然后在词性标签上应用某种正则表达式,把感兴趣的单词连接成关键词短语。
这将输出以下内容:
(原文此处的输出示例已丢失。)
不过,你可以随意调整,尝试其他类型的表达式,以便根据要连接到一起的单词/标记准确地获得所需的内容。
如果你感兴趣的话,也可以看看这篇关于关键词/单词提取的非常好的介绍:
https://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
相关问题 更多 >
编程相关推荐