mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Significantly improved lookup fuzzy matching.
This commit is contained in:
parent
5eb41ddeb6
commit
b4539aa346
1 changed file with 58 additions and 24 deletions
|
@ -52,23 +52,43 @@ class LanguageWeighting(whoosh.scoring.Weighting):
|
||||||
before foreign results.
|
before foreign results.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, extra_weights={}, *args, **kwargs):
|
||||||
|
"""`extra_weights` may be a dictionary of weights which will be
|
||||||
|
factored in.
|
||||||
|
|
||||||
|
Intended for use with spelling corrections, which come along with their
|
||||||
|
own weightings.
|
||||||
|
"""
|
||||||
|
self.extra_weights = extra_weights
|
||||||
|
super(LanguageWeighting, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
|
def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
|
||||||
doc = searcher.stored_fields(docnum)
|
doc = searcher.stored_fields(docnum)
|
||||||
|
|
||||||
|
# Apply extra weight
|
||||||
|
weight = weight * self.extra_weights.get(text, 1.0)
|
||||||
|
|
||||||
if doc['language'] == None:
|
if doc['language'] == None:
|
||||||
# English (well, "default"); leave it at 1
|
# English (well, "default"); leave it at 1
|
||||||
return weight
|
return weight
|
||||||
elif doc['language'] == u'Roomaji':
|
elif doc['language'] == u'Roomaji':
|
||||||
# Give Roomaji a little boost; it's most likely to be searched
|
# Give Roomaji a little boost; it's most likely to be searched
|
||||||
return weight * 0.95
|
return weight * 0.9
|
||||||
else:
|
else:
|
||||||
# Everything else can drop down the totem pole
|
# Everything else can drop down the totem pole
|
||||||
return weight * 0.9
|
return weight * 0.8
|
||||||
|
|
||||||
|
|
||||||
class PokedexLookup(object):
|
class PokedexLookup(object):
|
||||||
INTERMEDIATE_LOOKUP_RESULTS = 25
|
INTERMEDIATE_LOOKUP_RESULTS = 25
|
||||||
MAX_LOOKUP_RESULTS = 10
|
MAX_LOOKUP_RESULTS = 10
|
||||||
|
|
||||||
|
# The speller only checks how much the input matches a word; there can be
|
||||||
|
# all manner of extra unmatched junk, and it won't affect the weighting.
|
||||||
|
# To compensate, greatly boost the weighting of matches at the beginning
|
||||||
|
# and end, so nearly-full-word-matches are much better
|
||||||
|
SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
|
||||||
|
|
||||||
# Dictionary of table name => table class.
|
# Dictionary of table name => table class.
|
||||||
# Need the table name so we can get the class from the table name after we
|
# Need the table name so we can get the class from the table name after we
|
||||||
# retrieve something from the index
|
# retrieve something from the index
|
||||||
|
@ -136,7 +156,8 @@ class PokedexLookup(object):
|
||||||
|
|
||||||
# Create speller, and done
|
# Create speller, and done
|
||||||
spell_store = whoosh.filedb.filestore.FileStorage(directory)
|
spell_store = whoosh.filedb.filestore.FileStorage(directory)
|
||||||
self.speller = whoosh.spelling.SpellChecker(spell_store)
|
self.speller = whoosh.spelling.SpellChecker(spell_store,
|
||||||
|
**self.SPELLER_OPTIONS)
|
||||||
|
|
||||||
|
|
||||||
def rebuild_index(self):
|
def rebuild_index(self):
|
||||||
|
@ -159,11 +180,7 @@ class PokedexLookup(object):
|
||||||
writer = self.index.writer()
|
writer = self.index.writer()
|
||||||
|
|
||||||
# Index every name in all our tables of interest
|
# Index every name in all our tables of interest
|
||||||
# speller_entries becomes a list of (word, score) tuples; the score is
|
speller_entries = set()
|
||||||
# 2 for English names, 1.5 for Roomaji, and 1 for everything else. I
|
|
||||||
# think this biases the results in the direction most people expect,
|
|
||||||
# especially when e.g. German names are very similar to English names
|
|
||||||
speller_entries = []
|
|
||||||
for cls in self.indexed_tables.values():
|
for cls in self.indexed_tables.values():
|
||||||
q = self.session.query(cls)
|
q = self.session.query(cls)
|
||||||
|
|
||||||
|
@ -171,7 +188,7 @@ class PokedexLookup(object):
|
||||||
row_key = dict(table=unicode(cls.__tablename__),
|
row_key = dict(table=unicode(cls.__tablename__),
|
||||||
row_id=unicode(row.id))
|
row_id=unicode(row.id))
|
||||||
|
|
||||||
def add(name, language, iso3166, score):
|
def add(name, language, iso3166):
|
||||||
normalized_name = self.normalize_name(name)
|
normalized_name = self.normalize_name(name)
|
||||||
|
|
||||||
writer.add_document(
|
writer.add_document(
|
||||||
|
@ -180,21 +197,21 @@ class PokedexLookup(object):
|
||||||
**row_key
|
**row_key
|
||||||
)
|
)
|
||||||
|
|
||||||
speller_entries.append((normalized_name, score))
|
speller_entries.add(normalized_name)
|
||||||
|
|
||||||
|
|
||||||
# Add the basic English name to the index
|
# Add the basic English name to the index
|
||||||
if cls == tables.Pokemon:
|
if cls == tables.Pokemon:
|
||||||
# Pokémon need their form name added
|
# Pokémon need their form name added
|
||||||
# XXX kinda kludgy
|
# XXX kinda kludgy
|
||||||
add(row.full_name, None, u'us', 1)
|
add(row.full_name, None, u'us')
|
||||||
|
|
||||||
# If this is a default form, ALSO add the unadorned name,
|
# If this is a default form, ALSO add the unadorned name,
|
||||||
# so 'Deoxys' alone will still do the right thing
|
# so 'Deoxys' alone will still do the right thing
|
||||||
if row.forme_name and not row.forme_base_pokemon_id:
|
if row.forme_name and not row.forme_base_pokemon_id:
|
||||||
add(row.name, None, u'us', 1)
|
add(row.name, None, u'us')
|
||||||
else:
|
else:
|
||||||
add(row.name, None, u'us', 1)
|
add(row.name, None, u'us')
|
||||||
|
|
||||||
# Some things also have other languages' names
|
# Some things also have other languages' names
|
||||||
# XXX other language form names..?
|
# XXX other language form names..?
|
||||||
|
@ -207,21 +224,21 @@ class PokedexLookup(object):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
add(moonspeak, foreign_name.language.name,
|
add(moonspeak, foreign_name.language.name,
|
||||||
foreign_name.language.iso3166,
|
foreign_name.language.iso3166)
|
||||||
3)
|
|
||||||
|
|
||||||
# Add Roomaji too
|
# Add Roomaji too
|
||||||
if foreign_name.language.name == 'Japanese':
|
if foreign_name.language.name == 'Japanese':
|
||||||
roomaji = romanize(foreign_name.name)
|
roomaji = romanize(foreign_name.name)
|
||||||
add(roomaji, u'Roomaji', u'jp', 8)
|
add(roomaji, u'Roomaji', u'jp')
|
||||||
|
|
||||||
writer.commit()
|
writer.commit()
|
||||||
|
|
||||||
# Construct and populate a spell-checker index. Quicker to do it all
|
# Construct and populate a spell-checker index. Quicker to do it all
|
||||||
# at once, as every call to add_* does a commit(), and those seem to be
|
# at once, as every call to add_* does a commit(), and those seem to be
|
||||||
# expensive
|
# expensive
|
||||||
self.speller = whoosh.spelling.SpellChecker(self.index.storage)
|
self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
|
||||||
self.speller.add_scored_words(speller_entries)
|
**self.SPELLER_OPTIONS)
|
||||||
|
self.speller.add_words(speller_entries)
|
||||||
|
|
||||||
|
|
||||||
def normalize_name(self, name):
|
def normalize_name(self, name):
|
||||||
|
@ -445,13 +462,30 @@ class PokedexLookup(object):
|
||||||
exact = False
|
exact = False
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for suggestion in self.speller.suggest(
|
fuzzy_query_parts = []
|
||||||
name, self.INTERMEDIATE_LOOKUP_RESULTS):
|
fuzzy_weights = {}
|
||||||
|
min_weight = [None]
|
||||||
|
for suggestion, _, weight in self.speller.suggestions_and_scores(name):
|
||||||
|
# Only keep suggestions scoring at least half the first (best) suggestion's
|
||||||
|
# weight; otherwise there will always be a lot of trailing junk
|
||||||
|
if min_weight[0] is None:
|
||||||
|
min_weight[0] = weight * 0.5
|
||||||
|
elif weight < min_weight[0]:
|
||||||
|
break
|
||||||
|
|
||||||
query = whoosh.query.Term('name', suggestion)
|
fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
|
||||||
if type_term:
|
fuzzy_weights[suggestion] = weight
|
||||||
query = query & type_term
|
|
||||||
results.extend(searcher.search(query))
|
if not fuzzy_query_parts:
|
||||||
|
# Nothing at all; don't try querying
|
||||||
|
return []
|
||||||
|
|
||||||
|
fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
|
||||||
|
if type_term:
|
||||||
|
fuzzy_query = fuzzy_query & type_term
|
||||||
|
|
||||||
|
searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
|
||||||
|
results = searcher.search(fuzzy_query)
|
||||||
|
|
||||||
### Convert results to db objects
|
### Convert results to db objects
|
||||||
objects = self._whoosh_records_to_results(results, exact=exact)
|
objects = self._whoosh_records_to_results(results, exact=exact)
|
||||||
|
|
Loading…
Reference in a new issue