从文件路径中提取人名

import nltk
import re


def extract_entities(y):
    """Return the set of PERSON entities NLTK finds in a backslash-separated path.

    Each path segment is split into word tokens, POS-tagged and run through
    NLTK's named-entity chunker; only chunks labelled ``PERSON`` are kept.

    Args:
        y: A file path whose components are separated by backslashes.

    Returns:
        A set of strings, one per distinct person chunk. Each string is the
        chunk's words joined by spaces and passed through ``str.capitalize``,
        so only the first character stays upper-case.
    """
    # Words you don't want in the result even when they are tagged as PERSON.
    unwanted = {'Schedules', 'Old'}
    # A set gets rid of duplicates as they are collected.
    people = set()
    # Split the file path by backslashes and handle one segment at a time.
    for segment in y.split("\\"):
        # Separate the segment into words, attach POS tags (e.g. NNP), then
        # attach the more specific entity labels (e.g. PERSON).
        words = re.findall(r"[\w]+", segment)
        for chunk in nltk.ne_chunk(nltk.pos_tag(words)):
            # Plain (word, tag) tuples have no label(); only subtrees do.
            if hasattr(chunk, 'label') and chunk.label() == "PERSON":
                name = ' '.join(leaf[0] for leaf in chunk.leaves()).capitalize()
                if name not in unwanted:
                    people.add(name)
    return people


# Raw string: the backslashes are path separators, not escape sequences.
text = r"O:\Country\Province\District\city\Cricket, Jimmy (Y1617F)\Old Schedules\Cricket, Jimmy (78655) Golick doo wop 7 Sept 2016.xlsx"

print(extract_entities(text))

import nltk
import re
from nltk import RegexpParser


def extract_entities(y):
    """Attempt to collect person names from a backslash-separated path.

    This is the failing variant: it swaps NLTK's named-entity chunker for a
    RegexpParser grammar and raises the TypeError shown in the traceback
    that follows this snippet.
    """
    AggPeople = []
    # Chunk grammar meant to group runs of proper nouns (NNP) under "NP".
    # NOTE(review): the leading "<" before "NP:" looks unintended - confirm.
    patterns= r"<NP:{<NNP>+}"
    chunker = RegexpParser(patterns)
    print(chunker)
    for y in y.split("\\"):
        # This is the line the traceback points at: the parser object is
        # called directly, giving "'RegexpParser' object is not callable".
        for chunk in chunker(nltk.pos_tag(re.findall(r"[\w]+", y))) :
            # NOTE(review): this still tests for "PERSON", while the grammar
            # above names its chunks "NP" - presumably a leftover; verify.
            if hasattr(chunk, 'label') and chunk.label() == "PERSON":
                AggPeople.append(' '.join(c[0] for c in chunk.leaves()).capitalize())
    # Drop folder words that are not names.
    AggPeople = [x for x in AggPeople if (x not in ['Schedules','Old'])]
    return set(AggPeople)

chunk.RegexpParser with 1 stages: RegexpChunkParser with 1 rules: <ChunkRule: '<NNP>'> Traceback (most recent call last): File "<ipython-input-282-cb323eff63b4>", line 1, in <module> runfile('C:/Users//.spyder-py3/ExtractingNames.py', wdir='C:/Users//.spyder-py3') File "C:\spydercustomize.py", line 827, in runfile execfile(filename, namespace) File "C:\spydercustomize.py", line 110, in execfile exec(compile(f.read(), filename, 'exec'), namespace) File "C:/Users//.spyder-py3/ExtractingNames.py", line 32, in <module> print(extract_entities(text)) File "C:/Users//.spyder-py3/ExtractingNames.py", line 23, in extract_entities for chunk in chunker(nltk.pos_tag(re.findall(r"[\w]+", y))) : TypeError: 'RegexpParser' object is not callable

1条回答

网友

1楼 · 发布于 2024-10-02 04:24:17

# Chunk grammar: label as "P" any two proper nouns (NNP tags) side-by-side.
patterns= r"P:{<NNP>{2}}"
# Built once at module level so the function below can reuse it.
chunker = nltk.RegexpParser(patterns)

def extract_entities(y):
    """Return the set of two-proper-noun names found in a backslash-separated path.

    Each path segment is tokenised into whole words that contain no digits,
    skipping the folder words "Schedules" and "Old". The words are POS-tagged
    and passed to the module-level ``chunker``; chunks labelled ``"P"`` are
    collected.

    Args:
        y: A file path whose components are separated by backslashes.

    Returns:
        A set of strings, one per distinct "P" chunk. Each string is the
        chunk's words joined by spaces and passed through ``str.capitalize``,
        so only the first character stays upper-case.
    """
    # Whole words with no digits. The inner \b makes the lookahead reject only
    # the exact words "Schedules"/"Old", not every word that starts with them
    # (e.g. a surname such as "Oldham").
    word_pattern = re.compile(r"\b(?!(?:Schedules|Old)\b)[^\d\W]+\b")
    people = set()
    for segment in y.split("\\"):
        words = word_pattern.findall(segment)
        if not words:
            # Nothing to tag or chunk in this segment.
            continue
        for chunk in chunker.parse(nltk.pos_tag(words)):
            # Plain (word, tag) tuples have no label(); only subtrees do.
            if hasattr(chunk, 'label') and chunk.label() == "P":
                people.add(' '.join(leaf[0] for leaf in chunk.leaves()).capitalize())
    return people

# Raw string: the backslashes are path separators, not escape sequences
# (a plain literal triggers invalid-escape warnings on recent Python versions).
text = r"O:\Country\Province\District\city\Cricket, Jimmy (Y1617F)\Old Schedules\Cricket, Jimmy (78655) Golick doo wop 7 Sept 2016.xlsx"

print(extract_entities(text))

如果将chunker放在循环之外，可以使代码运行得更快（否则，它会随着循环的每次迭代而重新生成）
如果您要查找的是人名，而人名通常由两部分组成（名和姓），则可以在模式中使用{2}指定连续两个NNP
您可以使用负向先行断言（negative lookahead）排除正则表达式中的某些单词，并使用[^\d\W]排除其中包含数字的单词

相关问题更多 >

编程相关推荐

热门问题

热门文章