如何在spacy nlp中添加新的实体（ORG）实例

# Question script: try to register ticker symbols as ORG entities on the
# matcher attached to the English pipeline, then compare token annotations
# before and after.
#
# NOTE(review): the original snippet was collapsed onto one line, so its
# indentation was lost. The registration loop is assumed to end right after
# ``nlp.matcher.add(...)`` -- confirm against the original post.
from spacy.en import English
import spacy.en
from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
import os
import csv
import sys

nlp = English()  # Load everything for the English model

# Every print below uses a single pre-formatted argument so the output is the
# same under the Python 2 print statement and the Python 3 print function.
print("Before nlp vocab length %s" % (len(nlp.matcher.vocab),))

symbol_list = [u"CHK", u"JONE", u"NE", u"DO", u"ESV"]
# Alternative input that spells out the company names:
# u"""Drive double-digit rallies in Chesapeake Energy (NYSE: CHK), Noble Corporation (NYSE:NE), Diamond Offshore (NYSE:DO), Ensco (NYSE:ESV), and Jones Energy (NYSE: JONE)"""
txt = u"""drive double-digit rallies in Chesapeake Energy (NYSE: CHK), (NYSE: NE), (NYSE: DO), (NYSE: ESV), (NYSE: JONE)"""

# Baseline: annotations before any custom pattern is registered.
before = nlp(txt)
for tok in before:
    print("%s %s %s %s" % (tok, tok.orth, tok.tag_, tok.ent_type_))

# Register one single-token pattern per symbol under the ORG label.
for symbol in symbol_list:
    print("adding symbol: %s" % (symbol,))
    print("vocab length: %s" % (len(nlp.matcher.vocab),))
    print("pattern length: %s" % (nlp.matcher.n_patterns,))
    nlp.matcher.add(symbol, u'ORG', {}, [[{u'orth': symbol}]])

# Dump the matcher's internal state after registration.
print("Patterns: %s" % (nlp.matcher._patterns,))
print("Entities: %s" % (nlp.matcher._entities,))
for ent in nlp.matcher._entities:
    print(ent.label)

# Re-run the pipeline to see whether the new patterns changed the annotations.
tokens = nlp(txt)

print("\n\nAfter:")
print("After nlp vocab length %s" % (len(nlp.matcher.vocab),))
for tok in tokens:
    print("%s %s %s %s" % (tok, tok.orth, tok.tag_, tok.ent_type_))

1条回答

网友

1楼 · 发布于 2024-09-22 14:39:50

以下是基于官方文档的可运行示例：

import spacy

# Module-level spaCy object loaded from 'en'; merge_phrases() below reads
# nlp.vocab.strings, and the Matcher is built on nlp.vocab.
nlp = spacy.load('en')

def merge_phrases(matcher, doc, i, matches):
    """Matcher callback that merges every matched phrase in ``doc``.

    Merging changes the token indices, so nothing is done until the callback
    fires for the final match; at that point all matches are merged in one go.

    Args:
        matcher: The matcher invoking the callback (not used here).
        doc: The document being matched; sliced as ``doc[start:end]``.
        i: Index of the match this call is for.
        matches: Sequence of ``(ent_id, label, start, end)`` tuples.

    Returns:
        None in every case.
    """
    final_index = len(matches) - 1
    if i != final_index:
        return None

    # Slice out every span first, while the token offsets in ``matches``
    # are still valid, and only then start merging.
    pending = []
    for ent_id, label, start, end in matches:
        pending.append((label, doc[start:end]))

    for label, phrase in pending:
        # Labelled matches get the 'NNP' tag; otherwise keep the root's tag.
        tag = 'NNP' if label else phrase.root.tag_
        phrase.merge(tag, phrase.text, nlp.vocab.strings[label])

# Build a matcher over the shared vocab and register one single-token pattern
# per string, all under the STOCK label and all routed to merge_phrases.
matcher = spacy.matcher.Matcher(nlp.vocab)
for key, orth in (('stock-nyse', 'NYSE'), ('stock-esv', 'ESV')):
    matcher.add(
        entity_key=key,
        label='STOCK',
        attrs={},
        specs=[[{spacy.attrs.ORTH: orth}]],
        on_match=merge_phrases,
    )

doc = nlp(u"""drive double-digit rallies in Chesapeake Energy (NYSE: CHK), (NYSE: NE), (NYSE: DO), (NYSE: ESV), (NYSE: JONE)""")
# Running the matcher triggers the on_match callback for each match.
matcher(doc)

# Show each token's text next to its entity type.
annotated = ['%s|%s' % (t.orth_, t.ent_type_) for t in doc]
print(annotated)

（此处原为上述代码的输出结果，在页面抓取时丢失。）

NYSE 和 ESV 现在被标记为 STOCK 实体类型。基本上，对于每一个匹配，您都需要手动合并 token 和/或分配所需的实体类型。此外还有一个 acceptor 函数，可以在匹配发生时过滤或拒绝匹配项。

相关问题更多 >

编程相关推荐

热门问题

热门文章