Added support for looking things up by their names in other languages. #15

English fuzzy matches are preferred, followed by Roomaji and then
everything else.

The named tuples returned by lookup() now have a `name` field holding the
actual name that was matched.
This commit is contained in:
Eevee 2009-08-22 01:13:34 -07:00
parent 4e51867e95
commit 2bc41e2c62
2 changed files with 46 additions and 24 deletions

View file

@ -66,8 +66,12 @@ def command_lookup(name):
else: else:
print "Fuzzy-matched:" print "Fuzzy-matched:"
for object, language, exact in results: for result in results:
print object.__tablename__, object.name, language print "%s: %s" % (result.object.__tablename__, result.object.name),
if result.language:
print "(%s in %s)" % (result.name, result.language)
else:
print
def command_help(): def command_help():

View file

@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
writer = index.writer() writer = index.writer()
# Index every name in all our tables of interest # Index every name in all our tables of interest
# speller_entries becomes a list of (word, score) tuples; the score is 2
# for English names, 1.5 for Roomaji, and 1 for everything else. I think
# this biases the results in the direction most people expect, especially
# when e.g. German names are very similar to English names
speller_entries = [] speller_entries = []
for cls in indexed_tables.values(): for cls in indexed_tables.values():
q = session.query(cls) q = session.query(cls)
@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):
name = row.name.lower() name = row.name.lower()
writer.add_document(name=name, **row_key) writer.add_document(name=name, **row_key)
speller_entries.append(name) speller_entries.append((name, 1))
for extra_key_func in extra_keys.get(cls, []): for extra_key_func in extra_keys.get(cls, []):
extra_key = extra_key_func(row) extra_key = extra_key_func(row)
writer.add_document(name=extra_key, **row_key) writer.add_document(name=extra_key, **row_key)
# Pokemon also get other languages # Pokemon also get other languages
if cls == tables.Pokemon: for foreign_name in getattr(row, 'foreign_names', []):
for foreign_name in row.foreign_names: moonspeak = foreign_name.name.lower()
name = foreign_name.name.lower() if name == moonspeak:
writer.add_document(name=name, # Don't add the English name again as a different language;
language=foreign_name.language.name, # no point and it makes spell results confusing
**row_key) continue
speller_entries.append(name)
if foreign_name.language.name == 'Japanese': writer.add_document(name=moonspeak,
# Add Roomaji too language=foreign_name.language.name,
roomaji = romanize(foreign_name.name).lower() **row_key)
writer.add_document(name=roomaji, speller_entries.append((moonspeak, 3))
language='Roomaji',
**row_key) # Add Roomaji too
speller_entries.append(roomaji) if foreign_name.language.name == 'Japanese':
roomaji = romanize(foreign_name.name).lower()
writer.add_document(name=roomaji, language='Roomaji',
**row_key)
speller_entries.append((roomaji, 8))
writer.commit() writer.commit()
@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
# at once, as every call to add_* does a commit(), and those seem to be # at once, as every call to add_* does a commit(), and those seem to be
# expensive # expensive
speller = whoosh.spelling.SpellChecker(index.storage) speller = whoosh.spelling.SpellChecker(index.storage)
speller.add_words(speller_entries) speller.add_scored_words(speller_entries)
return index, speller return index, speller
LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact']) LookupResult = namedtuple('LookupResult',
['object', 'name', 'language', 'exact'])
def lookup(name, session=None, indices=None, exact_only=False): def lookup(name, session=None, indices=None, exact_only=False):
"""Attempts to find some sort of object, given a database session and name. """Attempts to find some sort of object, given a database session and name.
Returns a list of named (object, language, exact) tuples. `object` is a Returns a list of named (object, name, language, exact) tuples. `object`
database object, `language` is the name of the language in which the name is a database object, `name` is the name under which the object was found,
was found, and `exact` is True iff this was an exact match. `language` is the name of the language in which the name was found, and
`exact` is True iff this was an exact match.
This function currently ONLY does fuzzy matching if there are no exact This function currently ONLY does fuzzy matching if there are no exact
matches. matches.
@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
seen = {} seen = {}
for result in results: for result in results:
# Skip dupe results # Skip dupe results
# Note! The speller prefers English names, but the query does not. So
# "latias" comes over "ratiasu". "latias" matches only the English
# row, comes out first, and all is well.
# However! The speller could then return "foo" which happens to be the
# name for two different things in different languages, and the
# non-English one could appear preferred. This is not very likely.
seen_key = result['table'], result['row_id'] seen_key = result['table'], result['row_id']
if seen_key in seen: if seen_key in seen:
continue continue
@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):
cls = indexed_tables[result['table']] cls = indexed_tables[result['table']]
obj = session.query(cls).get(result['row_id']) obj = session.query(cls).get(result['row_id'])
objects.append(LookupResult(obj, result['language'], exact)) objects.append(LookupResult(object=obj,
name=result['name'],
language=result['language'],
exact=exact))
return objects return objects[:5]