mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Use the new Whoosh API for spelling. May help #181.
This commit is contained in:
parent
bb49a6bc39
commit
af8215535e
2 changed files with 8 additions and 36 deletions
|
@@ -13,6 +13,7 @@ import whoosh.index
|
||||||
from whoosh.qparser import QueryParser
|
from whoosh.qparser import QueryParser
|
||||||
import whoosh.scoring
|
import whoosh.scoring
|
||||||
import whoosh.spelling
|
import whoosh.spelling
|
||||||
|
from whoosh.support import levenshtein
|
||||||
|
|
||||||
from pokedex.compatibility import namedtuple
|
from pokedex.compatibility import namedtuple
|
||||||
|
|
||||||
|
@@ -86,12 +87,6 @@ class PokedexLookup(object):
|
||||||
MAX_EXACT_RESULTS = 43
|
MAX_EXACT_RESULTS = 43
|
||||||
INTERMEDIATE_FACTOR = 2
|
INTERMEDIATE_FACTOR = 2
|
||||||
|
|
||||||
# The speller only checks how much the input matches a word; there can be
|
|
||||||
# all manner of extra unmatched junk, and it won't affect the weighting.
|
|
||||||
# To compensate, greatly boost the weighting of matches at the beginning
|
|
||||||
# and end, so nearly-full-word-matches are much better
|
|
||||||
SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
|
|
||||||
|
|
||||||
# Dictionary of table name => table class.
|
# Dictionary of table name => table class.
|
||||||
# Need the table name so we can get the class from the table name after we
|
# Need the table name so we can get the class from the table name after we
|
||||||
# retrieve something from the index
|
# retrieve something from the index
|
||||||
|
@@ -124,8 +119,7 @@ class PokedexLookup(object):
|
||||||
`pokedex setup`.
|
`pokedex setup`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# By the time this returns, self.index, self.speller, and self.session
|
# By the time this returns, self.index and self.session must be set
|
||||||
# must be set
|
|
||||||
|
|
||||||
# If a directory was not given, use the default
|
# If a directory was not given, use the default
|
||||||
if directory is None:
|
if directory is None:
|
||||||
|
@@ -144,7 +138,6 @@ class PokedexLookup(object):
|
||||||
# rebuild_index before doing anything. Provide a dummy object that
|
# rebuild_index before doing anything. Provide a dummy object that
|
||||||
# complains when used
|
# complains when used
|
||||||
self.index = UninitializedIndex()
|
self.index = UninitializedIndex()
|
||||||
self.speller = UninitializedIndex()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# Otherwise, already exists; should be an index! Bam, done.
|
# Otherwise, already exists; should be an index! Bam, done.
|
||||||
|
@@ -158,17 +151,11 @@ class PokedexLookup(object):
|
||||||
"Please use a dedicated directory for the lookup index."
|
"Please use a dedicated directory for the lookup index."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create speller, and done
|
|
||||||
spell_store = whoosh.filedb.filestore.FileStorage(directory)
|
|
||||||
self.speller = whoosh.spelling.SpellChecker(spell_store,
|
|
||||||
**self.SPELLER_OPTIONS)
|
|
||||||
|
|
||||||
|
|
||||||
def rebuild_index(self):
|
def rebuild_index(self):
|
||||||
"""Creates the index from scratch."""
|
"""Creates the index from scratch."""
|
||||||
|
|
||||||
schema = whoosh.fields.Schema(
|
schema = whoosh.fields.Schema(
|
||||||
name=whoosh.fields.ID(stored=True),
|
name=whoosh.fields.ID(stored=True, spelling=True),
|
||||||
table=whoosh.fields.ID(stored=True),
|
table=whoosh.fields.ID(stored=True),
|
||||||
row_id=whoosh.fields.ID(stored=True),
|
row_id=whoosh.fields.ID(stored=True),
|
||||||
language=whoosh.fields.STORED,
|
language=whoosh.fields.STORED,
|
||||||
|
@@ -191,7 +178,6 @@ class PokedexLookup(object):
|
||||||
writer = self.index.writer()
|
writer = self.index.writer()
|
||||||
|
|
||||||
# Index every name in all our tables of interest
|
# Index every name in all our tables of interest
|
||||||
speller_entries = set()
|
|
||||||
for cls in self.indexed_tables.values():
|
for cls in self.indexed_tables.values():
|
||||||
q = self.session.query(cls).order_by(cls.id)
|
q = self.session.query(cls).order_by(cls.id)
|
||||||
|
|
||||||
|
@@ -208,9 +194,6 @@ class PokedexLookup(object):
|
||||||
**row_key
|
**row_key
|
||||||
)
|
)
|
||||||
|
|
||||||
speller_entries.add(normalized_name)
|
|
||||||
|
|
||||||
|
|
||||||
if cls == tables.PokemonForm:
|
if cls == tables.PokemonForm:
|
||||||
name_map = 'pokemon_name_map'
|
name_map = 'pokemon_name_map'
|
||||||
else:
|
else:
|
||||||
|
@@ -240,13 +223,6 @@ class PokedexLookup(object):
|
||||||
|
|
||||||
writer.commit()
|
writer.commit()
|
||||||
|
|
||||||
# Construct and populate a spell-checker index. Quicker to do it all
|
|
||||||
# at once, as every call to add_* does a commit(), and those seem to be
|
|
||||||
# expensive
|
|
||||||
self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
|
|
||||||
**self.SPELLER_OPTIONS)
|
|
||||||
self.speller.add_words(speller_entries)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_name(self, name):
|
def normalize_name(self, name):
|
||||||
"""Strips irrelevant formatting junk from name input.
|
"""Strips irrelevant formatting junk from name input.
|
||||||
|
@@ -509,16 +485,12 @@ class PokedexLookup(object):
|
||||||
fuzzy_query_parts = []
|
fuzzy_query_parts = []
|
||||||
fuzzy_weights = {}
|
fuzzy_weights = {}
|
||||||
min_weight = [None]
|
min_weight = [None]
|
||||||
for suggestion, _, weight in self.speller.suggestions_and_scores(name):
|
corrector = searcher.corrector('name')
|
||||||
# Only allow the top 50% of scores; otherwise there will always
|
for suggestion in corrector.suggest(name, limit=max_results):
|
||||||
# be a lot of trailing junk
|
|
||||||
if min_weight[0] is None:
|
|
||||||
min_weight[0] = weight * 0.5
|
|
||||||
elif weight < min_weight[0]:
|
|
||||||
break
|
|
||||||
|
|
||||||
fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
|
fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
|
||||||
fuzzy_weights[suggestion] = weight
|
distance = levenshtein.relative(name, suggestion)
|
||||||
|
fuzzy_weights[suggestion] = distance
|
||||||
|
|
||||||
if not fuzzy_query_parts:
|
if not fuzzy_query_parts:
|
||||||
# Nothing at all; don't try querying
|
# Nothing at all; don't try querying
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -9,7 +9,7 @@ setup(
|
||||||
},
|
},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'SQLAlchemy>=0.6.6',
|
'SQLAlchemy>=0.6.6',
|
||||||
'whoosh>=1.1.0',
|
'whoosh>=2.2.2',
|
||||||
'markdown',
|
'markdown',
|
||||||
'construct',
|
'construct',
|
||||||
],
|
],
|
||||||
|
|
Loading…
Reference in a new issue