import re
class RegExConcordanceIndex(object):
    """Regex-based stand-in for nltk's ``ConcordanceIndex.print_concordance``.

    The index is built around a single raw text string (not a token list)
    and searches it with a regular expression.
    """

    def __init__(self, text):
        """Store ``text``, the full string that will be searched."""
        self._text = text

    def _concordance_lines(self, regex, width, lines, demarcation):
        """Return the list of context lines for matches of ``regex``.

        Each entry is ``left + demarcation + match + demarcation + right``,
        where ``left``/``right`` hold at most ``(width - len(match)) // 2``
        characters and never cross a newline. Collection stops once
        ``lines`` entries exist; a falsy ``lines`` collects every match.
        """
        text = self._text
        found = []
        for match in re.finditer(regex, text, flags=re.M):
            start, end = match.span()
            # A match wider than ``width`` leaves no room for context;
            # clamp so the slices below never see a negative length.
            remaining = max(0, (width - (end - start)) // 2)
            # Clamp the left edge at 0 so a match near the beginning of the
            # text takes only what precedes it (and never part of itself).
            left = text[max(0, start - remaining):start]
            right = text[end:end + remaining]
            # Cut the context short at the nearest newline on either side.
            left = left.split('\n')[-1]
            right = right.split('\n')[0]
            found.append(
                left + demarcation + match.group(0) + demarcation + right)
            if lines and len(found) >= lines:
                break
        return found

    def print_concordance(self, regex, width=80, lines=25, demarcation=''):
        """Print up to ``lines`` contexts for ``regex``.

        Args:
            regex: pattern passed to ``re.finditer`` (with ``re.M``).
            width: target total width of each printed line; the context on
                each side is ``(width - len(match)) // 2`` characters at most.
            lines: maximum number of matches to show; 0 shows all matches.
            demarcation: string placed immediately before and after each
                match to make it stand out.

        Prints "No matches" when the pattern is not found; otherwise prints
        a "Displaying N matches:" header followed by one line per match.
        """
        found = self._concordance_lines(regex, width, lines, demarcation)
        # ``re.finditer`` returns an iterator that is always truthy, so the
        # emptiness check has to be made on the collected results.
        if not found:
            print("No matches")
            return
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("Displaying %s matches:" % len(found))
        print('\n'.join(found))
简而言之,nltk目前无法根据正则表达式创建语汇索引(concordance)。使用nltk的
ConcordanceIndex
类(或其子类)创建语汇索引的困难在于,该类接受标记(token)列表作为参数(并围绕这些标记构建),而不是完整的文本字符串。我的建议是创建您自己的类,让它接受字符串而不是标记作为参数。下面是一个松散地基于nltk的
ConcordanceIndex
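类的类,可以作为起点(即上面的代码)。现在可以这样测试这个类:
index = RegExConcordanceIndex("Jane and Emma met.\nJane and Frank left.")
index.print_concordance(r'Jane and \w+', demarcation='%%')
相关问题 更多 >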
编程相关推荐