如何使基本的倒排索引程序更加 Pythonic

class invertedIndex(object):
    """Naive inverted index built on two parallel lists.

    ``termList[k]`` holds a term and ``docLists[k]`` holds the indices of the
    documents containing that term, in the order they were first seen.

    Args:
        docs: sequence of document strings; a document's id is its position
            in the sequence.
    """

    def __init__(self, docs):
        self.docs, self.termList, self.docLists = docs, [], []
        for index, doc in enumerate(docs):
            for term in doc.split(" "):
                if term in self.termList:
                    i = self.termList.index(term)
                    # A term repeated inside one document is recorded once.
                    if index not in self.docLists[i]:
                        self.docLists[i].append(index)
                else:
                    self.termList.append(term)
                    self.docLists.append([index])

    def search(self, term):
        """Return the list of document ids for ``term``, or "No results"."""
        try:
            i = self.termList.index(term)
        except ValueError:
            # list.index raises ValueError when the term was never indexed.
            return "No results"
        return self.docLists[i]


docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
]

i = invertedIndex(docs)
# Query the instance: calling search on the class itself would not supply
# ``self``.  Single-argument print(...) behaves the same on Python 2 and 3.
print(i.search("sales"))

2条回答

网友

1楼 · 编辑于 2024-09-29 22:22:13

将文档索引存储在 Python 的 set 中，并使用 dict 把每个词项映射到它对应的“文档集合”（doc set）。

from collections import defaultdict

class invertedIndex(object):
    """Inverted index mapping each term to the set of documents containing it.

    Args:
        docs: iterable of document strings; a document's id is its position
            in the iteration order.
    """

    def __init__(self, docs):
        # defaultdict(set) creates the posting set the first time a term is
        # seen; the set itself absorbs repeated terms within one document.
        self.docSets = defaultdict(set)
        for index, doc in enumerate(docs):
            for term in doc.split():
                self.docSets[term].add(index)

    def search(self, term):
        """Return the set of document ids containing ``term``.

        An unknown term yields an empty set.  The lookup uses ``get`` rather
        than ``[]`` so that a miss does not insert an empty entry into the
        index.
        """
        return self.docSets.get(term, set())

# Sample corpus: a document's id is its position in this list.
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
]

i = invertedIndex(docs)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
# Python 2 shows set([0, 1, 2, 3]); Python 3 shows {0, 1, 2, 3}.
print(i.search("sales"))

set 的工作方式有点像列表，但它是无序的，并且不能包含重复的条目。

defaultdict 基本上就是一个 dict，只是在访问不存在的键时，它会自动创建一个默认值（在本例中是一个空集合）。

网友

2楼 · 编辑于 2024-09-29 22:22:13

这个解决方案与 @Peter Gibson 的几乎相同。在这个版本中，索引本身就是数据，不需要委托给单独的 docSets 对象，这使得代码稍微简短、清晰一些。

代码还保留了文档的原始顺序……这算是一个 bug，我更喜欢 Peter 的 set() 实现。

还请注意，引用不存在的项（例如 ix['garbage']）会隐式地修改索引。如果唯一的 API 是 search，这没有问题，但这种情况值得注意。

来源

class InvertedIndex(dict):
    """Inverted index that is itself the term -> posting-list mapping.

    Each key is a term; its value is the list of ids of the documents that
    contain the term, in document order, with each document listed once.

    Args:
        docs: sequence of document strings; a document's id is its position
            in the sequence.  The sequence is kept on ``self.docs``.
    """

    def __init__(self, docs):
        super(InvertedIndex, self).__init__()
        self.docs = docs

        for doc_index, doc in enumerate(docs):
            # split() without an argument ignores runs of whitespace, so the
            # empty string is never indexed as a term.
            for term in doc.split():
                postings = self[term]
                # Ids are appended in increasing order, so a repeat of the
                # current document can only be the last entry.
                if not postings or postings[-1] != doc_index:
                    postings.append(doc_index)

    def __missing__(self, term):
        """Create, store and return an empty posting list for ``term``.

        Operates like ``defaultdict(list)``: note that ``ix[term]`` on an
        unknown term therefore adds that term to the index.
        """
        self[term] = []
        return self[term]

    def search(self, term):
        """Return the posting list for ``term``, or 'No results'.

        Uses ``get``, which does not trigger ``__missing__``, so searching
        never modifies the index.  An empty posting list also reports
        'No results'.
        """
        return self.get(term) or 'No results'


# Sample corpus: a document's id is its position in this list.
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
    'beer',
]

ix = InvertedIndex(docs)
# Every print below takes a single pre-formatted argument, so the output is
# the same whether this runs under Python 2 or Python 3.
print(ix.__dict__)  # instance attributes only; the index entries live in the dict itself
print('')
for term in ("sales", "whiskey", "beer"):
    print('%s: %s' % (term, ix.search(term)))

print('\nTEST OF KEY SETTING')
# Subscripting an unknown term goes through __missing__ and inserts it ...
print(ix['garbage'])
# ... so the term is now a key of the index ...
print('garbage' in ix)
# ... but its posting list is empty, so search still reports no results.
print(ix.search('garbage'))

输出

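{'docs': ['new home sales top forecasts june june june', 'home sales rise in july june', 'increase in home sales in july', 'july new home sales rise', 'beer']}

sales: [0, 1, 2, 3]
whiskey: No results
beer: [4]

TEST OF KEY SETTING
[]
True
No results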

来源

输出

相关问题更多 >

编程相关推荐

热门问题

热门文章