mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Added support for lookup by other language name. #15
English fuzzy matches are preferred, followed by Roomaji and then everything else. Each named tuple returned by lookup() now has a `name` field holding the actual name that was matched.
This commit is contained in:
parent
4e51867e95
commit
2bc41e2c62
2 changed files with 46 additions and 24 deletions
|
@ -66,8 +66,12 @@ def command_lookup(name):
|
||||||
else:
|
else:
|
||||||
print "Fuzzy-matched:"
|
print "Fuzzy-matched:"
|
||||||
|
|
||||||
for object, language, exact in results:
|
for result in results:
|
||||||
print object.__tablename__, object.name, language
|
print "%s: %s" % (result.object.__tablename__, result.object.name),
|
||||||
|
if result.language:
|
||||||
|
print "(%s in %s)" % (result.name, result.language)
|
||||||
|
else:
|
||||||
|
print
|
||||||
|
|
||||||
|
|
||||||
def command_help():
|
def command_help():
|
||||||
|
|
|
@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
writer = index.writer()
|
writer = index.writer()
|
||||||
|
|
||||||
# Index every name in all our tables of interest
|
# Index every name in all our tables of interest
|
||||||
|
# speller_entries becomes a list of (word, score) tuples; the score is 2
|
||||||
|
# for English names, 1.5 for Roomaji, and 1 for everything else. I think
|
||||||
|
# this biases the results in the direction most people expect, especially
|
||||||
|
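# when e.g. German names are very similar to English names.
# NOTE(review): the entries appended below use scores 1 (English), 3 (other languages) and 8 (Roomaji), which do not match the values stated above -- confirm which is intended.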
|
||||||
speller_entries = []
|
speller_entries = []
|
||||||
for cls in indexed_tables.values():
|
for cls in indexed_tables.values():
|
||||||
q = session.query(cls)
|
q = session.query(cls)
|
||||||
|
@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
|
|
||||||
name = row.name.lower()
|
name = row.name.lower()
|
||||||
writer.add_document(name=name, **row_key)
|
writer.add_document(name=name, **row_key)
|
||||||
speller_entries.append(name)
|
speller_entries.append((name, 1))
|
||||||
|
|
||||||
for extra_key_func in extra_keys.get(cls, []):
|
for extra_key_func in extra_keys.get(cls, []):
|
||||||
extra_key = extra_key_func(row)
|
extra_key = extra_key_func(row)
|
||||||
writer.add_document(name=extra_key, **row_key)
|
writer.add_document(name=extra_key, **row_key)
|
||||||
|
|
||||||
# Pokemon also get other languages
|
# Pokemon also get other languages
|
||||||
if cls == tables.Pokemon:
|
for foreign_name in getattr(row, 'foreign_names', []):
|
||||||
for foreign_name in row.foreign_names:
|
moonspeak = foreign_name.name.lower()
|
||||||
name = foreign_name.name.lower()
|
if name == moonspeak:
|
||||||
writer.add_document(name=name,
|
# Don't add the English name again as a different language;
|
||||||
|
# no point and it makes spell results confusing
|
||||||
|
continue
|
||||||
|
|
||||||
|
writer.add_document(name=moonspeak,
|
||||||
language=foreign_name.language.name,
|
language=foreign_name.language.name,
|
||||||
**row_key)
|
**row_key)
|
||||||
speller_entries.append(name)
|
speller_entries.append((moonspeak, 3))
|
||||||
|
|
||||||
if foreign_name.language.name == 'Japanese':
|
|
||||||
# Add Roomaji too
|
# Add Roomaji too
|
||||||
|
if foreign_name.language.name == 'Japanese':
|
||||||
roomaji = romanize(foreign_name.name).lower()
|
roomaji = romanize(foreign_name.name).lower()
|
||||||
writer.add_document(name=roomaji,
|
writer.add_document(name=roomaji, language='Roomaji',
|
||||||
language='Roomaji',
|
|
||||||
**row_key)
|
**row_key)
|
||||||
speller_entries.append(roomaji)
|
speller_entries.append((roomaji, 8))
|
||||||
|
|
||||||
|
|
||||||
writer.commit()
|
writer.commit()
|
||||||
|
@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
# at once, as every call to add_* does a commit(), and those seem to be
|
# at once, as every call to add_* does a commit(), and those seem to be
|
||||||
# expensive
|
# expensive
|
||||||
speller = whoosh.spelling.SpellChecker(index.storage)
|
speller = whoosh.spelling.SpellChecker(index.storage)
|
||||||
speller.add_words(speller_entries)
|
speller.add_scored_words(speller_entries)
|
||||||
|
|
||||||
return index, speller
|
return index, speller
|
||||||
|
|
||||||
|
|
||||||
LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
|
LookupResult = namedtuple('LookupResult',
|
||||||
|
['object', 'name', 'language', 'exact'])
|
||||||
def lookup(name, session=None, indices=None, exact_only=False):
|
def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
"""Attempts to find some sort of object, given a database session and name.
|
"""Attempts to find some sort of object, given a database session and name.
|
||||||
|
|
||||||
Returns a list of named (object, language, exact) tuples. `object` is a
|
Returns a list of named (object, name, language, exact) tuples. `object`
|
||||||
database object, `language` is the name of the language in which the name
|
is a database object, `name` is the name under which the object was found,
|
||||||
was found, and `exact` is True iff this was an exact match.
|
`language` is the name of the language in which the name was found, and
|
||||||
|
`exact` is True iff this was an exact match.
|
||||||
|
|
||||||
This function currently ONLY does fuzzy matching if there are no exact
|
This function currently ONLY does fuzzy matching if there are no exact
|
||||||
matches.
|
matches.
|
||||||
|
@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
seen = {}
|
seen = {}
|
||||||
for result in results:
|
for result in results:
|
||||||
# Skip dupe results
|
# Skip dupe results
|
||||||
|
# Note! The speller prefers English names, but the query does not. So
|
||||||
|
# "latias" comes over "ratiasu". "latias" matches only the English
|
||||||
|
# row, comes out first, and all is well.
|
||||||
|
# However! The speller could then return "foo" which happens to be the
|
||||||
|
# name for two different things in different languages, and the
|
||||||
|
# non-English one could appear preferred. This is not very likely.
|
||||||
seen_key = result['table'], result['row_id']
|
seen_key = result['table'], result['row_id']
|
||||||
if seen_key in seen:
|
if seen_key in seen:
|
||||||
continue
|
continue
|
||||||
|
@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
|
|
||||||
cls = indexed_tables[result['table']]
|
cls = indexed_tables[result['table']]
|
||||||
obj = session.query(cls).get(result['row_id'])
|
obj = session.query(cls).get(result['row_id'])
|
||||||
objects.append(LookupResult(obj, result['language'], exact))
|
objects.append(LookupResult(object=obj,
|
||||||
|
name=result['name'],
|
||||||
|
language=result['language'],
|
||||||
|
exact=exact))
|
||||||
|
|
||||||
return objects
|
return objects[:5]
|
||||||
|
|
Loading…
Reference in a new issue