Fix whoosh result sorting.

Now results are sorted by is-this-your-language (times levenshtein
distance, if appropriate), then by rough class of result (Pokémon, then
moves, then abilities, etc.) and finally by name.

This fixes a couple issues:

- If both a foreign name and a local name matched a wildcard lookup,
  you'll see the local name.  Before, you'd see whichever happened to be
  first alphabetically.

- Wildcard results are more likely to have useful stuff at the top,
  rather than being dominated by foreign junk and names of obscure
  locations.

This also updates our usage of the whoosh API, which was old and busted
as of 2.0 or so.
This commit is contained in:
Eevee 2011-09-09 22:31:38 -07:00
parent 514ac79216
commit eb6cae4bf3

View file

@ -12,6 +12,7 @@ import whoosh.filedb.fileindex
import whoosh.index import whoosh.index
from whoosh.qparser import QueryParser from whoosh.qparser import QueryParser
import whoosh.scoring import whoosh.scoring
import whoosh.sorting
import whoosh.spelling import whoosh.spelling
from whoosh.support import levenshtein from whoosh.support import levenshtein
@ -49,40 +50,51 @@ class UninitializedIndex(object):
"or lookup.rebuild_index() to create it." "or lookup.rebuild_index() to create it."
) )
class LanguageWeighting(whoosh.scoring.Weighting): def LanguageFacet(locale_ident, extra_weights={}):
"""A scoring class that forces otherwise-equal English results to come """Constructs a sorting function that bubbles results from the current
before foreign results. locale (given by `locale_ident`) to the top of the list.
`extra_weights` may be a dictionary of weights which will be factored in.
Intended for use with spelling corrections, which come along with their own
weightings.
""" """
def score(searcher, docnum):
def __init__(self, locale_ident, extra_weights={}, *args, **kwargs):
"""`extra_weights` may be a dictionary of weights which will be
factored in.
Intended for use with spelling corrections, which come along with their
own weightings.
"""
self.locale_ident = locale_ident
self.extra_weights = extra_weights
super(LanguageWeighting, self).__init__(*args, **kwargs)
def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
doc = searcher.stored_fields(docnum) doc = searcher.stored_fields(docnum)
weight = extra_weights.get(doc['name'], 1.0)
# Apply extra weight doc_language = doc['language']
weight = weight * self.extra_weights.get(text, 1.0) if doc_language == locale_ident:
doc_language = doc.get('language')
if doc_language == self.locale_ident:
# Bump up names in the current locale # Bump up names in the current locale
return weight * 2.0 weight *= 2.0
elif doc_language == u'roomaji': elif doc_language == u'roomaji':
# Given that the Japanese names are the originals, it seems likely # Given that the Japanese names are the originals, it seems likely
# that basically anyone might want to look them up. Boost them a # that basically anyone might want to look them up. Boost them a
# little bit. # little bit.
return weight * 1.4 weight *= 1.4
return weight # Higher weights should come FIRST, but sorts are ascending. Negate
# the weight to fix this
return -weight
return whoosh.sorting.FunctionFacet(score)
_table_order = dict(
pokemon_species=1,
pokemon_forms=1,
moves=2,
abilities=3,
items=4,
types=5,
locations=6,
natures=7,
)
def _table_facet_impl(searcher, docnum):
u"""Implements a sort that puts different "types" of results in a
relatively natural order: Pokémon first, then moves, etc.
"""
doc = searcher.stored_fields(docnum)
return _table_order[doc['table']]
table_facet = whoosh.sorting.FunctionFacet(_table_facet_impl)
class PokedexLookup(object): class PokedexLookup(object):
@ -468,21 +480,22 @@ class PokedexLookup(object):
# Fuzzy are capped at 10, beyond which something is probably very # Fuzzy are capped at 10, beyond which something is probably very
# wrong. Exact matches -- that is, wildcards and ids -- are far less # wrong. Exact matches -- that is, wildcards and ids -- are far less
# constrained. # constrained.
# Also, exact matches are sorted by name, since weight doesn't matter.
sort_by = dict()
if exact_only: if exact_only:
max_results = self.MAX_EXACT_RESULTS max_results = self.MAX_EXACT_RESULTS
sort_by['sortedby'] = (u'table', u'name')
else: else:
max_results = self.MAX_FUZZY_RESULTS max_results = self.MAX_FUZZY_RESULTS
locale = self._get_current_locale() locale = self._get_current_locale()
searcher = self.index.searcher( facet = whoosh.sorting.MultiFacet([
weighting=LanguageWeighting(locale.identifier)) LanguageFacet(locale.identifier),
table_facet,
"name",
])
searcher = self.index.searcher()
results = searcher.search( results = searcher.search(
query, query,
limit=int(max_results * self.INTERMEDIATE_FACTOR), limit=int(max_results * self.INTERMEDIATE_FACTOR),
**sort_by sortedby=facet,
) )
# Look for some fuzzy matches if necessary # Look for some fuzzy matches if necessary
@ -492,10 +505,8 @@ class PokedexLookup(object):
fuzzy_query_parts = [] fuzzy_query_parts = []
fuzzy_weights = {} fuzzy_weights = {}
min_weight = [None]
corrector = searcher.corrector('name') corrector = searcher.corrector('name')
for suggestion in corrector.suggest(name, limit=max_results): for suggestion in corrector.suggest(name, limit=max_results):
fuzzy_query_parts.append(whoosh.query.Term('name', suggestion)) fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
distance = levenshtein.relative(name, suggestion) distance = levenshtein.relative(name, suggestion)
fuzzy_weights[suggestion] = distance fuzzy_weights[suggestion] = distance
@ -508,9 +519,9 @@ class PokedexLookup(object):
if type_term: if type_term:
fuzzy_query = fuzzy_query & type_term fuzzy_query = fuzzy_query & type_term
searcher.weighting = LanguageWeighting( sorter = LanguageFacet(
locale.identifier, extra_weights=fuzzy_weights) locale.identifier, extra_weights=fuzzy_weights)
results = searcher.search(fuzzy_query) results = searcher.search(fuzzy_query, sortedby=sorter)
### Convert results to db objects ### Convert results to db objects
objects = self._whoosh_records_to_results(results, exact=exact) objects = self._whoosh_records_to_results(results, exact=exact)
@ -570,7 +581,7 @@ class PokedexLookup(object):
locale = self._get_current_locale() locale = self._get_current_locale()
searcher = self.index.searcher() searcher = self.index.searcher()
searcher.weighting = LanguageWeighting(locale.identifier) facet = LanguageFacet(locale.identifier)
results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS) results = searcher.search(query, sortedby=facet) # XXX , limit=self.MAX_LOOKUP_RESULTS)
return self._whoosh_records_to_results(results) return self._whoosh_records_to_results(results)