<p>这里有一个简单的解决方案:</p>
<pre><code>import re
def lexicon_by_word(lexicons):
    """Invert a {category: [words]} mapping into {word: category}.

    If the same word is listed under several categories, the category
    iterated last wins (same as the original dict comprehension).
    """
    inverted = {}
    for category, words in lexicons.items():
        for word in words:
            inverted[word] = category
    return inverted
def split_sentences(st):
    """Split *st* into sentences on '.', '?' or '!' (plus trailing spaces).

    When the text ends with sentence punctuation, re.split produces a
    trailing empty string; that empty tail is dropped before returning.
    """
    parts = re.split(r'[.?!]\s*', st)
    return parts if parts[-1] else parts[:-1]
def ngrams_finder(lexicons, text):
    """Find lexicon words/phrases in each sentence of *text*.

    Parameters:
        lexicons: mapping of {category: [word or phrase, ...]}.
        text: the text to scan, split into sentences on ./?/!.

    Returns:
        A list with one entry per sentence; each entry is a list of
        [matched_word, category] pairs (empty when the sentence contains
        no lexicon word).
    """
    lexicons_by_word = lexicon_by_word(lexicons)
    # Sort alternatives longest-first: regex alternation takes the
    # leftmost alternative that matches, so without sorting "banana"
    # would permanently shadow "banana split".  re.escape guards against
    # regex metacharacters appearing inside lexicon entries.
    words = sorted(lexicons_by_word, key=len, reverse=True)
    pattern = re.compile("|".join(map(re.escape, words)))
    ngrams = []
    for sentence in split_sentences(text):
        # findall returns [] when nothing matches -- it never raises
        # IndexError, so no exception handling is needed here (the
        # original's try/except IndexError was dead code).
        ngrams.append([[match, lexicons_by_word[match]]
                       for match in pattern.findall(sentence)])
    return ngrams
# Demo run -- adapt the sample text and the lexicons to your own data.
text = (
    "Yesterday I had a coca cola, and a hot dog for lunch, and some "
    "bana split for desert. I liked the coke, but the banana in the "
    "banana split dessert was ripe"
)
lexicons = {
    "food": ["hot dog", "banana", "banana split"],
    "beverage": ["coke", "cola", "coca cola"],
}
print(ngrams_finder(lexicons, text))
</code></pre>
<p>分句函数取自此处:<a href="https://stackoverflow.com/questions/43593428/splitting-a-sentence-by-ending-characters/43596240">Splitting a sentence by ending characters</a></p>