Improve multilingual support in lookup.

Language identifiers are stored and retrieved, rather than English
names.

Language weighting is now biased towards the current language, rather
than towards English.

A null language is no longer used as a stand-in for English.

Duplicate names in other languages are no longer omitted from the index.
This commit is contained in:
Eevee 2011-09-08 20:58:29 -07:00
parent 66988fb070
commit 514ac79216

View file

@ -54,13 +54,14 @@ class LanguageWeighting(whoosh.scoring.Weighting):
before foreign results. before foreign results.
""" """
def __init__(self, extra_weights={}, *args, **kwargs): def __init__(self, locale_ident, extra_weights={}, *args, **kwargs):
"""`extra_weights` may be a dictionary of weights which will be """`extra_weights` may be a dictionary of weights which will be
factored in. factored in.
Intended for use with spelling corrections, which come along with their Intended for use with spelling corrections, which come along with their
own weightings. own weightings.
""" """
self.locale_ident = locale_ident
self.extra_weights = extra_weights self.extra_weights = extra_weights
super(LanguageWeighting, self).__init__(*args, **kwargs) super(LanguageWeighting, self).__init__(*args, **kwargs)
@ -70,16 +71,18 @@ class LanguageWeighting(whoosh.scoring.Weighting):
# Apply extra weight # Apply extra weight
weight = weight * self.extra_weights.get(text, 1.0) weight = weight * self.extra_weights.get(text, 1.0)
language = doc.get('language') doc_language = doc.get('language')
if language is None:
# English (well, "default"); leave it at 1 if doc_language == self.locale_ident:
return weight # Bump up names in the current locale
elif language == u'Roomaji': return weight * 2.0
# Give Roomaji a little boost; it's most likely to be searched elif doc_language == u'roomaji':
return weight * 0.9 # Given that the Japanese names are the originals, it seems likely
else: # that basically anyone might want to look them up. Boost them a
# Everything else can drop down the totem pole # little bit.
return weight * 0.8 return weight * 1.4
return weight
class PokedexLookup(object): class PokedexLookup(object):
@ -199,27 +202,19 @@ class PokedexLookup(object):
else: else:
name_map = 'name_map' name_map = 'name_map'
seen = set([None]) for language, name in getattr(row, name_map, {}).items():
for language, name in sorted(getattr(row, name_map, {}).items(),
# Sort English first for now
key=lambda (l, n): (l.identifier != 'en', not l.official)):
if not name: if not name:
continue continue
if name in seen:
# Don't add the name again as a different
# language; no point and it makes spell results
# confusing
continue
seen.add(name)
add(name, language.name, add(name, language.identifier,
language.iso639, language.iso639,
language.iso3166) language.iso3166)
# Add Roomaji too # Add generated Roomaji too
# XXX this should be a first-class concept, not
# piggybacking on Japanese
if language.identifier == 'ja': if language.identifier == 'ja':
roomaji = romanize(name) add(romanize(name), language.identifier, language.iso639, language.iso3166)
add(roomaji, u'Roomaji', u'ja', u'jp')
writer.commit() writer.commit()
@ -353,6 +348,11 @@ class PokedexLookup(object):
"""Converts a list of whoosh's indexed records to LookupResult tuples """Converts a list of whoosh's indexed records to LookupResult tuples
containing database objects. containing database objects.
""" """
# XXX cache me?
languages = dict(
(row.identifier, row)
for row in self.session.query(tables.Language)
)
# XXX this 'exact' thing is getting kinda leaky. would like a better # XXX this 'exact' thing is getting kinda leaky. would like a better
# way to handle it, since only lookup() cares about fuzzy results # way to handle it, since only lookup() cares about fuzzy results
seen = {} seen = {}
@ -364,19 +364,25 @@ class PokedexLookup(object):
continue continue
seen[seen_key] = True seen[seen_key] = True
# XXX minimize queries here?
cls = self.indexed_tables[record['table']] cls = self.indexed_tables[record['table']]
obj = self.session.query(cls).get(record['row_id']) obj = self.session.query(cls).get(record['row_id'])
results.append(LookupResult(object=obj, results.append(LookupResult(object=obj,
indexed_name=record['name'], indexed_name=record['name'],
name=record['display_name'], name=record['display_name'],
language=record.get('language'), language=languages[record['language']],
iso639=record['iso639'], iso639=record['iso639'],
iso3166=record['iso3166'], iso3166=record['iso3166'],
exact=exact)) exact=exact))
return results return results
def _get_current_locale(self):
    """Fetch the ORM row for the session's current default language."""
    session = self.session
    language_query = session.query(tables.Language)
    # Look the row up by the id stored on the session.
    return language_query.get(session.default_language_id)
def lookup(self, input, valid_types=[], exact_only=False): def lookup(self, input, valid_types=[], exact_only=False):
"""Attempts to find some sort of object, given a name. """Attempts to find some sort of object, given a name.
@ -470,7 +476,9 @@ class PokedexLookup(object):
else: else:
max_results = self.MAX_FUZZY_RESULTS max_results = self.MAX_FUZZY_RESULTS
searcher = self.index.searcher(weighting=LanguageWeighting()) locale = self._get_current_locale()
searcher = self.index.searcher(
weighting=LanguageWeighting(locale.identifier))
results = searcher.search( results = searcher.search(
query, query,
limit=int(max_results * self.INTERMEDIATE_FACTOR), limit=int(max_results * self.INTERMEDIATE_FACTOR),
@ -500,7 +508,8 @@ class PokedexLookup(object):
if type_term: if type_term:
fuzzy_query = fuzzy_query & type_term fuzzy_query = fuzzy_query & type_term
searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights) searcher.weighting = LanguageWeighting(
locale.identifier, extra_weights=fuzzy_weights)
results = searcher.search(fuzzy_query) results = searcher.search(fuzzy_query)
### Convert results to db objects ### Convert results to db objects
@ -559,8 +568,9 @@ class PokedexLookup(object):
if type_term: if type_term:
query = query & type_term query = query & type_term
locale = self._get_current_locale()
searcher = self.index.searcher() searcher = self.index.searcher()
searcher.weighting = LanguageWeighting() searcher.weighting = LanguageWeighting(locale.identifier)
results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS) results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS)
return self._whoosh_records_to_results(results) return self._whoosh_records_to_results(results)