改进我的代码：在Python的一个大列表中对相同的单词进行分组，并与其他代码进行比较

import itertools


class Seeder:
    """Assign each word a representative "seed" so similar words share one.

    A word's seed is the first previously seen seed whose edit distance to
    the word is at most ``limit``; if there is none, the word becomes a new
    seed itself. Results are memoised per word in ``cache``.

    NOTE: ``seeds`` is a set, so when a word is within ``limit`` of several
    seeds, which one it is assigned to depends on set iteration order.
    """

    def __init__(self, limit=2):
        """Create an empty seeder.

        ``limit`` is the largest edit distance at which two words still
        count as similar (defaults to 2, the previously hard-coded value).
        """
        self.seeds = set()
        self.cache = dict()
        self.limit = limit

    def get_seed(self, word):
        """Return the seed for ``word``, registering it as a seed if needed."""
        seed = self.cache.get(word)
        if seed is not None:
            return seed
        limit = self.limit
        for seed in self.seeds:
            # The edit distance is never smaller than the length difference,
            # so skip the expensive computation when that already exceeds it.
            if abs(len(seed) - len(word)) > limit:
                continue
            if self.distance(seed, word) <= limit:
                self.cache[word] = seed
                return seed
        self.seeds.add(word)
        self.cache[word] = word
        return word

    def distance(self, s1, s2):
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``."""
        # Only the previous row of the dynamic-programming table is needed.
        previous = list(range(len(s1) + 1))
        for row, ch2 in enumerate(s2, 1):
            current = [row]
            for col, ch1 in enumerate(s1, 1):
                current.append(min(
                    current[col - 1] + 1,                # drop ch1 from s1
                    previous[col] + 1,                   # drop ch2 from s2
                    previous[col - 1] + (ch1 != ch2),    # keep or substitute
                ))
            previous = current
        return previous[-1]


def group_similar(words):
    """Group ``words`` by similarity and return the groups as lists.

    Words are sorted by their seed so that members of one group are adjacent
    (required by ``itertools.groupby``). The sort is stable, so words keep
    their input order within a group.
    """
    seeder = Seeder()
    # Seeds are assigned while sorting; the groupby pass hits the cache.
    words = sorted(words, key=seeder.get_seed)
    groups = itertools.groupby(words, key=seeder.get_seed)
    return [list(members) for _, members in groups]

# Build d: for every entry of residencyList that also appears in
# uniqueResList, map the entry to a ", "-joined string of its occurrences.
# The previous version stored the first occurrence and then immediately
# appended it again, so every value started with a duplicate ("a, a").
wanted = set(uniqueResList)  # membership test instead of a nested scan
d = collections.defaultdict(int)
for i in residencyList:
    if i not in wanted:
        continue
    # d[i] is 0 (the defaultdict default) until the first occurrence is stored.
    d[i] = d[i] + ', ' + i if d[i] else i

2条回答

网友

1楼 · 编辑于 2024-09-26 17:50:08

我试着回答第一部分。类Seeder试图找到单词的种子（seeds）。假设两个相似的单词具有相同的种子，相似度由参数LIMIT（在本例中为2）控制，该参数限定两个单词之间允许的最大距离。计算字符串距离（string distance）的方法有很多，而你的类在distance函数中使用了某种“忍者数学”来计算字符串距离，坦白地说，我没能看懂。

def __init__(self):
    """Start with an empty word-to-seed memo and no known seeds."""
    # The two attributes are independent, so assignment order is irrelevant.
    self.cache = {}
    self.seeds = set()

将seeds初始化为一个set，它可以跟踪到目前为止唯一的种子；一个cache，在我们已经看到这个单词的情况下加快查找速度（以节省计算时间）。

对于任何单词，get_seed函数返回其种子。

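def get_seed(self, word):
    LIMIT = 2
    seed = self.cache.get(word,None)
    if seed is not None:
        return seed
    for seed in self.seeds:
        if self.distance(seed, word) <= LIMIT:
            self.cache[word] = seed
            return seed
    self.seeds.add(word)
    self.cache[word] = word
    return word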

然后你根据单词的种子对这些单词进行排序。这样可以确保具有相同种子的单词相邻出现。这对于用于根据种子形成单词组的group by很重要。

distance函数看起来很复杂，可能会被Levenshtein之类的东西代替。

网友

2楼 · 编辑于 2024-09-26 17:50:08

对distance函数中“忍者数学”的简短解释：

 # this is just the edit distance (Levenshtein) between the two words
    def distance(self, s1, s2):
        """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

        ``matrix[zz][sz]`` holds the cheapest number of single-letter edits
        that turn the first ``sz`` letters of ``s1`` into the first ``zz``
        letters of ``s2``.
        """
        l1 = len(s1)  # length of first word
        l2 = len(s2)  # length of second word
        # Make an (l2 + 1) by (l1 + 1) matrix whose first row and column count
        # up from 0 to l1 and l2 respectively (the cost of deleting every
        # letter that came before that cell). Rows must be real lists because
        # the cells are overwritten below; a bare range() is immutable on
        # Python 3.
        matrix = [list(range(zz, zz + l1 + 1)) for zz in range(l2 + 1)]
        for zz in range(l2):
            for sz in range(l1):
                # Equal letters can be kept for free; different letters cost
                # one substitution.
                substitution = 0 if s1[sz] == s2[zz] else 1
                matrix[zz + 1][sz + 1] = min(
                    matrix[zz + 1][sz] + 1,            # delete the letter in s1
                    matrix[zz][sz + 1] + 1,            # delete the letter in s2
                    matrix[zz][sz] + substitution,     # keep or substitute
                )
        # The bottom-right cell is the cost of the cheapest set of edits.
        return matrix[l2][l1]

相关问题更多 >

编程相关推荐

热门问题

热门文章