mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Added support for lookup by other language name. #15
English fuzzy matches are preferred, followed by Roomaji and then everything else. The return tuple from lookup() now has a `name` parameter for the actual name that was matched.
This commit is contained in:
parent
4e51867e95
commit
2bc41e2c62
2 changed files with 46 additions and 24 deletions
|
@ -66,8 +66,12 @@ def command_lookup(name):
|
|||
else:
|
||||
print "Fuzzy-matched:"
|
||||
|
||||
for object, language, exact in results:
|
||||
print object.__tablename__, object.name, language
|
||||
for result in results:
|
||||
print "%s: %s" % (result.object.__tablename__, result.object.name),
|
||||
if result.language:
|
||||
print "(%s in %s)" % (result.name, result.language)
|
||||
else:
|
||||
print
|
||||
|
||||
|
||||
def command_help():
|
||||
|
|
|
@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
|
|||
writer = index.writer()
|
||||
|
||||
# Index every name in all our tables of interest
|
||||
# speller_entries becomes a list of (word, score) tuples; the score is 2
|
||||
# for English names, 1.5 for Roomaji, and 1 for everything else. I think
|
||||
# this biases the results in the direction most people expect, especially
|
||||
# when e.g. German names are very similar to English names
|
||||
speller_entries = []
|
||||
for cls in indexed_tables.values():
|
||||
q = session.query(cls)
|
||||
|
@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):
|
|||
|
||||
name = row.name.lower()
|
||||
writer.add_document(name=name, **row_key)
|
||||
speller_entries.append(name)
|
||||
speller_entries.append((name, 1))
|
||||
|
||||
for extra_key_func in extra_keys.get(cls, []):
|
||||
extra_key = extra_key_func(row)
|
||||
writer.add_document(name=extra_key, **row_key)
|
||||
|
||||
# Pokemon also get other languages
|
||||
if cls == tables.Pokemon:
|
||||
for foreign_name in row.foreign_names:
|
||||
name = foreign_name.name.lower()
|
||||
writer.add_document(name=name,
|
||||
for foreign_name in getattr(row, 'foreign_names', []):
|
||||
moonspeak = foreign_name.name.lower()
|
||||
if name == moonspeak:
|
||||
# Don't add the English name again as a different language;
|
||||
# no point and it makes spell results confusing
|
||||
continue
|
||||
|
||||
writer.add_document(name=moonspeak,
|
||||
language=foreign_name.language.name,
|
||||
**row_key)
|
||||
speller_entries.append(name)
|
||||
speller_entries.append((moonspeak, 3))
|
||||
|
||||
if foreign_name.language.name == 'Japanese':
|
||||
# Add Roomaji too
|
||||
if foreign_name.language.name == 'Japanese':
|
||||
roomaji = romanize(foreign_name.name).lower()
|
||||
writer.add_document(name=roomaji,
|
||||
language='Roomaji',
|
||||
writer.add_document(name=roomaji, language='Roomaji',
|
||||
**row_key)
|
||||
speller_entries.append(roomaji)
|
||||
speller_entries.append((roomaji, 8))
|
||||
|
||||
|
||||
writer.commit()
|
||||
|
@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
|
|||
# at once, as every call to add_* does a commit(), and those seem to be
|
||||
# expensive
|
||||
speller = whoosh.spelling.SpellChecker(index.storage)
|
||||
speller.add_words(speller_entries)
|
||||
speller.add_scored_words(speller_entries)
|
||||
|
||||
return index, speller
|
||||
|
||||
|
||||
LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
|
||||
LookupResult = namedtuple('LookupResult',
|
||||
['object', 'name', 'language', 'exact'])
|
||||
def lookup(name, session=None, indices=None, exact_only=False):
|
||||
"""Attempts to find some sort of object, given a database session and name.
|
||||
|
||||
Returns a list of named (object, language, exact) tuples. `object` is a
|
||||
database object, `language` is the name of the language in which the name
|
||||
was found, and `exact` is True iff this was an exact match.
|
||||
Returns a list of named (object, name, language, exact) tuples. `object`
|
||||
is a database object, `name` is the name under which the object was found,
|
||||
`language` is the name of the language in which the name was found, and
|
||||
`exact` is True iff this was an exact match.
|
||||
|
||||
This function currently ONLY does fuzzy matching if there are no exact
|
||||
matches.
|
||||
|
@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
|||
seen = {}
|
||||
for result in results:
|
||||
# Skip dupe results
|
||||
# Note! The speller prefers English names, but the query does not. So
|
||||
# "latias" comes over "ratiasu". "latias" matches only the English
|
||||
# row, comes out first, and all is well.
|
||||
# However! The speller could then return "foo" which happens to be the
|
||||
# name for two different things in different languages, and the
|
||||
# non-English one could appear preferred. This is not very likely.
|
||||
seen_key = result['table'], result['row_id']
|
||||
if seen_key in seen:
|
||||
continue
|
||||
|
@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
|||
|
||||
cls = indexed_tables[result['table']]
|
||||
obj = session.query(cls).get(result['row_id'])
|
||||
objects.append(LookupResult(obj, result['language'], exact))
|
||||
objects.append(LookupResult(object=obj,
|
||||
name=result['name'],
|
||||
language=result['language'],
|
||||
exact=exact))
|
||||
|
||||
return objects
|
||||
return objects[:5]
|
||||
|
|
Loading…
Reference in a new issue