How can I create my own TokenFilter in PyLucene by inheriting from PythonTokenFilter?

Posted 2024-09-30 14:26:37


Hi all,

I am developing my own analyzer with PyLucene 4.9.0, and in the analyzer I created a TokenFilter, CompoundTokenFilter, because DictionaryCompoundWordTokenFilter does not perform well.

DictionaryCompoundWordTokenFilter uses a brute-force algorithm, whereas I only want to split a compound word when all of its subwords are in the dictionary; for example, "breastcancer" is split only when the given dictionary contains both "breast" and "cancer".
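
Expressed as plain Python, the rule I want is roughly the following (a hypothetical helper for illustration only, not part of the filter below): a compound is decomposed only when it can be written entirely as a concatenation of dictionary words.

def decompositions(word, dictionary, min_sub=2, max_sub=15):
    # Return every way to write `word` as a concatenation of dictionary words
    # whose lengths fall between min_sub and max_sub.
    if not word:
        return [[]]
    results = []
    for end in range(min_sub, min(max_sub, len(word)) + 1):
        prefix = word[:end]
        if prefix in dictionary:
            for rest in decompositions(word[end:], dictionary, min_sub, max_sub):
                results.append([prefix] + rest)
    return results

print(decompositions("breastcancer", {"breast", "cancer"}))     # [['breast', 'cancer']] -> split
print(decompositions("breastcarcinoma", {"breast", "cancer"}))  # [] -> leave the token alone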

But when I run the program it reports "'CharTermAttribute' object attribute 'length' is not readable", and I cannot find what is wrong with it. Thanks!
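
For reference, this is the pattern I think the attribute access should follow, reduced to a minimal pass-through filter (a sketch assuming PyLucene 4.x APIs; the class name is made up):

from org.apache.pylucene.analysis import PythonTokenFilter
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

class PassThroughFilter(PythonTokenFilter):
    def __init__(self, input):
        super(PassThroughFilter, self).__init__(input)
        self.input = input
        # a TokenFilter shares its input's attribute source, so this returns
        # the same CharTermAttribute instance the tokenizer writes into
        self.termAtt = input.addAttribute(CharTermAttribute.class_)

    def incrementToken(self):
        if self.input.incrementToken():
            print self.termAtt.toString(), self.termAtt.length()  # length() is a method call
            return True
        return False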

from __future__ import division
import lucene, math, itertools

from java.lang import CharSequence
from java.io import IOException
from java.util import LinkedList
from org.apache.pylucene.analysis import PythonTokenStream
from org.apache.lucene.analysis import TokenFilter
from org.apache.pylucene.analysis import PythonTokenFilter
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.tokenattributes import OffsetAttribute
from org.apache.lucene.analysis.tokenattributes import PositionIncrementAttribute
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import AttributeSource
from org.apache.lucene.util import Version

class CompoundTokenFilter(PythonTokenFilter):

    def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE):
        super(CompoundTokenFilter,self).__init__(input)
        self.matchVersion=matchVersion
        self.dictionary=dictionary
        self.tokens=LinkedList()
        self.minWordSize=DEFAULT_MIN_WORD_SIZE
        self.minSubwordSize=DEFAULT_MIN_SUBWORD_SIZE
        self.maxSubwordSize=DEFAULT_MAX_SUBWORD_SIZE
        self.current=None  # holds a captured AttributeSource.State between calls
        self.termAtt=input.addAttribute(CharTermAttribute.class_)
        self.offsetAtt=input.addAttribute(OffsetAttribute.class_)
        self.posIncAtt=input.addAttribute(PositionIncrementAttribute.class_)
        self.input=input

    def decompose(self):
        l=self.termAtt.length()
        s=self.termAtt.subSequence(0,l)
        if s in self.dictionary:
            self.tokens.add(CompoundTokenFilter.CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,0,l))
        else:

            # keep only dictionary words that fit the subword size bounds and occur in the term
            d=filter(lambda x:len(x)>=self.minSubwordSize and len(x)<=self.maxSubwordSize and x in s,self.dictionary)
            if len(d)>0:
                start=int(math.floor(l/self.maxSubwordSize))
                end=int(math.ceil(l/self.minSubwordSize))
                subwords_combinations=[]
                for i in xrange(start,end+1):
                    subwords_combinations.extend(itertools.permutations(d,i))
                subwords_combinations=filter(lambda x:''.join(x)==s,subwords_combinations)
                subwords=sorted(set(reduce(lambda x,y:x+y,subwords_combinations)),key=lambda x:-1*len(x))
                for subword in subwords:
                    self.tokens.add(CompoundTokenFilter.CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,s.find(subword),s.find(subword)+len(subword)))

    def incrementToken(self):
        if (not self.tokens.isEmpty()):
            assert self.current!=None
            token=self.tokens.removeFirst()
            self.restoreState(self.current)
            self.termAtt.setEmpty().append(token.txt)
            self.offsetAtt.setOffset(token.startOffset, token.endOffset)
            self.posIncAtt.setPositionIncrement(0)
            return True

        self.current=None

        if(self.input.incrementToken()):
            if self.termAtt.length()>=self.minWordSize:
                self.decompose()
                if not self.tokens.isEmpty():
                    self.current=self.captureState()
            return True
        else:
            return False

    def reset(self):
        super(CompoundTokenFilter,self).reset()
        self.tokens.clear()
        self.current=None

    class CompoundToken:
        def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE,offset,length):
            # A throwaway filter over the same input is only used to reach the shared
            # term/offset attributes (a TokenFilter shares its input's attribute source);
            # it stands in for the CompoundWordTokenFilterBase.this references in Lucene's Java code.
            compoundTokenFilter=CompoundTokenFilter(matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE)
            self.txt=compoundTokenFilter.termAtt.subSequence(offset, offset + length)

            startOff = compoundTokenFilter.offsetAtt.startOffset()
            endOff = compoundTokenFilter.offsetAtt.endOffset()

            if matchVersion.onOrAfter(Version.LUCENE_4_4) or endOff - startOff != compoundTokenFilter.termAtt.length():
                self.startOffset = startOff
                self.endOffset = endOff
            else:
                newStart = startOff + offset
                self.startOffset = newStart
                self.endOffset = newStart + length
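
For completeness, this is how I drive the filter when testing (a usage sketch; the sample text, the Version constant, and the choice of WhitespaceTokenizer are my own assumptions):

if __name__ == '__main__':
    from java.io import StringReader
    from org.apache.lucene.analysis.core import WhitespaceTokenizer

    lucene.initVM()
    matchVersion = Version.LUCENE_4_9              # adjust to the constant your PyLucene build exposes
    dictionary = set([u'breast', u'cancer'])       # a plain Python set supports the 'in' checks above

    tokenizer = WhitespaceTokenizer(matchVersion, StringReader(u'breastcancer screening'))
    stream = CompoundTokenFilter(matchVersion, tokenizer, dictionary, 5, 2, 15)

    stream.reset()
    while stream.incrementToken():
        print stream.getAttribute(CharTermAttribute.class_).toString()
    stream.end()
    stream.close()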
