Use the new Whoosh API for spelling. May help #181.

This commit is contained in:
Petr Viktorin 2011-08-30 23:16:59 +03:00
parent bb49a6bc39
commit af8215535e
2 changed files with 8 additions and 36 deletions

View file

@ -13,6 +13,7 @@ import whoosh.index
from whoosh.qparser import QueryParser
import whoosh.scoring
import whoosh.spelling
from whoosh.support import levenshtein
from pokedex.compatibility import namedtuple
@ -86,12 +87,6 @@ class PokedexLookup(object):
MAX_EXACT_RESULTS = 43
INTERMEDIATE_FACTOR = 2
# The speller only checks how much the input matches a word; there can be
# all manner of extra unmatched junk, and it won't affect the weighting.
# To compensate, greatly boost the weighting of matches at the beginning
# and end, so nearly-full-word-matches are much better
SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
# Dictionary of table name => table class.
# Need the table name so we can get the class from the table name after we
# retrieve something from the index
@ -124,8 +119,7 @@ class PokedexLookup(object):
`pokedex setup`.
"""
# By the time this returns, self.index, self.speller, and self.session
# must be set
# By the time this returns, self.index and self.session must be set
# If a directory was not given, use the default
if directory is None:
@ -144,7 +138,6 @@ class PokedexLookup(object):
# rebuild_index before doing anything. Provide a dummy object that
# complains when used
self.index = UninitializedIndex()
self.speller = UninitializedIndex()
return
# Otherwise, already exists; should be an index! Bam, done.
@ -158,17 +151,11 @@ class PokedexLookup(object):
"Please use a dedicated directory for the lookup index."
)
# Create speller, and done
spell_store = whoosh.filedb.filestore.FileStorage(directory)
self.speller = whoosh.spelling.SpellChecker(spell_store,
**self.SPELLER_OPTIONS)
def rebuild_index(self):
"""Creates the index from scratch."""
schema = whoosh.fields.Schema(
name=whoosh.fields.ID(stored=True),
name=whoosh.fields.ID(stored=True, spelling=True),
table=whoosh.fields.ID(stored=True),
row_id=whoosh.fields.ID(stored=True),
language=whoosh.fields.STORED,
@ -191,7 +178,6 @@ class PokedexLookup(object):
writer = self.index.writer()
# Index every name in all our tables of interest
speller_entries = set()
for cls in self.indexed_tables.values():
q = self.session.query(cls).order_by(cls.id)
@ -208,9 +194,6 @@ class PokedexLookup(object):
**row_key
)
speller_entries.add(normalized_name)
if cls == tables.PokemonForm:
name_map = 'pokemon_name_map'
else:
@ -240,13 +223,6 @@ class PokedexLookup(object):
writer.commit()
# Construct and populate a spell-checker index. Quicker to do it all
# at once, as every call to add_* does a commit(), and those seem to be
# expensive
self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
**self.SPELLER_OPTIONS)
self.speller.add_words(speller_entries)
def normalize_name(self, name):
"""Strips irrelevant formatting junk from name input.
@ -509,16 +485,12 @@ class PokedexLookup(object):
fuzzy_query_parts = []
fuzzy_weights = {}
min_weight = [None]
for suggestion, _, weight in self.speller.suggestions_and_scores(name):
# Only allow the top 50% of scores; otherwise there will always
# be a lot of trailing junk
if min_weight[0] is None:
min_weight[0] = weight * 0.5
elif weight < min_weight[0]:
break
corrector = searcher.corrector('name')
for suggestion in corrector.suggest(name, limit=max_results):
fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
fuzzy_weights[suggestion] = weight
distance = levenshtein.relative(name, suggestion)
fuzzy_weights[suggestion] = distance
if not fuzzy_query_parts:
# Nothing at all; don't try querying

View file

@ -9,7 +9,7 @@ setup(
},
install_requires=[
'SQLAlchemy>=0.6.6',
'whoosh>=1.1.0',
'whoosh>=2.2.2',
'markdown',
'construct',
],