Sqlite具有真正的“全文搜索”和拼写错误（FTS+spellfix一起使用）

2条回答

网友
1楼 · 编辑于 2024-09-22 16:37:13

公认的答案是好的（完全归功于他），这里有一个小小的变化，虽然没有公认的复杂案例的完整性，但有助于理解以下观点：
import sqlite3 db = sqlite3.connect(':memory:') db.enable_load_extension(True) db.load_extension('./spellfix') c = db.cursor() c.execute("CREATE VIRTUAL TABLE mytable2 USING fts4(description text)") c.execute("CREATE VIRTUAL TABLE mytable2_terms USING fts4aux(mytable2)") c.execute("CREATE VIRTUAL TABLE mytable3 USING spellfix1") c.execute("INSERT INTO mytable2 VALUES ('All the Carmichael numbers')") # populate the table c.execute("INSERT INTO mytable2 VALUES ('They are great')") c.execute("INSERT INTO mytable2 VALUES ('Here some other numbers')") c.execute("INSERT INTO mytable3(word) SELECT term FROM mytable2_terms WHERE col='*'") def search(query): # Correcting each query term with spellfix table correctedquery = [] for t in query.split(): spellfix_query = "SELECT word FROM mytable3 WHERE word MATCH ? and top=1" c.execute(spellfix_query, (t,)) r = c.fetchone() correctedquery.append(r[0] if r is not None else t) # correct the word if any match in the spellfix table; if no match, keep the word spelled as it is (then the search will give no result!) correctedquery = ' '.join(correctedquery) # Now do the FTS fts_query = 'SELECT * FROM mytable2 WHERE description MATCH ?' c.execute(fts_query, (correctedquery,)) return {'result': c.fetchall(), 'correctedquery': correctedquery, 'query': query} print(search('NUMBBERS carmickaeel')) print(search('some HERE')) print(search('some qsdhiuhsd'))
结果如下：
{'query': 'NUMBBERS carmickaeel', 'correctedquery': u'numbers carmichael', 'result': [(u'All the Carmichael numbers',)]}
{'query': 'some HERE', 'correctedquery': u'some here', 'result': [(u'Here some other numbers',)]}
{'query': 'some qsdhiuhsd', 'correctedquery': u'some qsdhiuhsd', 'result': []}
备注：可以注意到，“用spellfix table更正每个查询项”部分是通过每个术语一个SQL查询完成的。研究了这个与单个联合SQL查询的性能here。在

网友
2楼 · 编辑于 2024-09-22 16:37:13

spellfix1文档实际上告诉您如何做到这一点。从Overview section：
If you intend to use this virtual table in cooperation with an FTS4 table (for spelling correction of search terms) then you might extract the vocabulary using an fts4aux table:
INSERT INTO demo(word) SELECT term FROM search_aux WHERE col='*';
SELECT term from search_aux WHERE col='*'语句extracts all the indexed tokens。在
将其与您的示例相连接，其中mytable2是您的fts4虚拟表，您可以创建一个fts4aux表，并将这些标记插入到mytable3spellfix1表中，方法是：
CREATE VIRTUAL TABLE mytable2_terms USING fts4aux(mytable2); INSERT INTO mytable3(word) SELECT term FROM mytable2_terms WHERE col='*';
您可能需要进一步限定该查询，以跳过已插入spellfix1中的任何术语，否则最终会出现两个条目：
^{pr2}$
现在您可以使用mytable3将拼写错误的单词映射到已更正的标记，然后在MATCH查询中使用这些已更正的标记mytable2。在
根据您的nead，这可能意味着您需要自己进行令牌处理和查询构建；没有公开的fts4查询语法分析器。因此，需要拆分两个令牌搜索字符串，每个令牌运行spellfix1表以映射到现有令牌，然后将这些令牌输入到fts4查询。在
忽略SQL语法来处理此问题，使用Python进行拆分非常简单：
def spellcheck_terms(conn, terms): cursor = conn.cursor() base_spellfix = """ SELECT :term{0} as term, word FROM spellfix1data WHERE word MATCH :term{0} and top=1 """ terms = terms.split() params = {"term{}".format(i): t for i, t in enumerate(terms, 1)} query = " UNION ".join([ base_spellfix.format(i + 1) for i in range(len(params))]) cursor.execute(query, params) correction_map = dict(cursor) return " ".join([correction_map.get(t, t) for t in terms]) def spellchecked_search(conn, terms): corrected_terms = spellcheck_terms(conn, terms) cursor = conn.cursor() fts_query = 'SELECT * FROM mytable2 WHERE mytable2 MATCH ?' cursor.execute(fts_query, (corrected_terms,)) return cursor.fetchall()
然后为spellchecked_search(db, "NUMMBER carmickaeel")返回[('All the Carmichael numbers',)]。在
然后，在Python中保持拼写检查处理允许您根据需要支持更复杂的FTS查询；您可能需要reimplement the expression parser才能这样做，但至少Python为您提供了这样做的工具。在
一个完整的例子，将上述方法打包到一个类中，该类简单地将术语提取为字母数字字符序列（根据我对表达式语法规范的阅读，这就足够了）：
import re import sqlite3 import sys class FTS4SpellfixSearch(object): def __init__(self, conn, spellfix1_path): self.conn = conn self.conn.enable_load_extension(True) self.conn.load_extension(spellfix1_path) def create_schema(self): self.conn.executescript( """ CREATE VIRTUAL TABLE IF NOT EXISTS fts4data USING fts4(description text); CREATE VIRTUAL TABLE IF NOT EXISTS fts4data_terms USING fts4aux(fts4data); CREATE VIRTUAL TABLE IF NOT EXISTS spellfix1data USING spellfix1; """ ) def index_text(self, *text): cursor = self.conn.cursor() with self.conn: params = ((t,) for t in text) cursor.executemany("INSERT INTO fts4data VALUES (?)", params) cursor.execute( """ INSERT INTO spellfix1data(word) SELECT term FROM fts4data_terms WHERE col='*' AND term not in (SELECT word from spellfix1data_vocab) """ ) # fts3 / 4 search expression tokenizer # no attempt is made to validate the expression, only # to identify valid search terms and extract them. # the fts3/4 tokenizer considers any alphanumeric ASCII character # and character in the range U+0080 and over to be terms. if sys.maxunicode == 0xFFFF: # UCS2 build, keep it simple, match any UTF-16 codepoint 0080 and over _fts4_expr_terms = re.compile(u"[a-zA-Z0-9\u0080-\uffff]+") else: # UCS4 _fts4_expr_terms = re.compile(u"[a-zA-Z0-9\u0080-\U0010FFFF]+") def _terms_from_query(self, search_query): """Extract search terms from a fts3/4 query Returns a list of terms and a template such that template.format(*terms) reconstructs the original query. terms using partial* syntax are ignored, as you can't distinguish between a misspelled prefix search that happens to match existing tokens and a valid spelling that happens to have 'near' tokens in the spellfix1 database that would not otherwise be matched by fts4 """ template, terms, lastpos = [], [], 0 for match in self._fts4_expr_terms.finditer(search_query): token, (start, end) = match.group(), match.span() # skip columnname: and partial* terms by checking next character ismeta = search_query[end:end + 1] in {":", "*"} # skip digits if preceded by "NEAR/" ismeta = ismeta or ( token.isdigit() and template and template[-1] == "NEAR" and "/" in search_query[lastpos:start]) if token not in {"AND", "OR", "NOT", "NEAR"} and not ismeta: # full search term, not a keyword, column name or partial* terms.append(token) token = "{}" template += search_query[lastpos:start], token lastpos = end template.append(search_query[lastpos:]) return terms, "".join(template) def spellcheck_terms(self, search_query): cursor = self.conn.cursor() base_spellfix = """ SELECT :term{0} as term, word FROM spellfix1data WHERE word MATCH :term{0} and top=1 """ terms, template = self._terms_from_query(search_query) params = {"term{}".format(i): t for i, t in enumerate(terms, 1)} query = " UNION ".join( [base_spellfix.format(i + 1) for i in range(len(params))] ) cursor.execute(query, params) correction_map = dict(cursor) return template.format(*(correction_map.get(t, t) for t in terms)) def search(self, search_query): corrected_query = self.spellcheck_terms(search_query) cursor = self.conn.cursor() fts_query = "SELECT * FROM fts4data WHERE fts4data MATCH ?" cursor.execute(fts_query, (corrected_query,)) return { "terms": search_query, "corrected": corrected_query, "results": cursor.fetchall(), }
以及使用该类的交互式演示：
>>> db = sqlite3.connect(":memory:") >>> fts = FTS4SpellfixSearch(db, './spellfix') >>> fts.create_schema() >>> fts.index_text("All the Carmichael numbers") # your example >>> from pprint import pprint >>> pprint(fts.search('NUMMBER carmickaeel')) {'corrected': 'numbers carmichael', 'results': [('All the Carmichael numbers',)], 'terms': 'NUMMBER carmickaeel'} >>> fts.index_text( ... "They are great", ... "Here some other numbers", ... ) >>> pprint(fts.search('here some')) # edgecase, multiple spellfix matches {'corrected': 'here some', 'results': [('Here some other numbers',)], 'terms': 'here some'} >>> pprint(fts.search('NUMMBER NOT carmickaeel')) # using fts4 query syntax {'corrected': 'numbers NOT carmichael', 'results': [('Here some other numbers',)], 'terms': 'NUMMBER NOT carmickaeel'}

背景：

问题：

相关问题更多 >

编程相关推荐

热门问题

热门文章