mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Added lookup support for foreign language names. #15
Changed lookup()'s return value to be a list of named tuples so the caller can know which language each result is in.
This commit is contained in:
parent
0b5eba6620
commit
4e51867e95
3 changed files with 55 additions and 32 deletions
|
@ -13,6 +13,11 @@ def main():
|
||||||
command = sys.argv[1]
|
command = sys.argv[1]
|
||||||
args = sys.argv[2:]
|
args = sys.argv[2:]
|
||||||
|
|
||||||
|
# XXX there must be a better way to get Unicode argv
|
||||||
|
# XXX this doesn't work on Windows
|
||||||
|
enc = sys.stdin.encoding
|
||||||
|
args = [_.decode(enc) for _ in args]
|
||||||
|
|
||||||
# Find the command as a function in this file
|
# Find the command as a function in this file
|
||||||
func = globals().get("command_%s" % command, None)
|
func = globals().get("command_%s" % command, None)
|
||||||
if func:
|
if func:
|
||||||
|
@ -53,14 +58,16 @@ def command_setup(*args):
|
||||||
|
|
||||||
|
|
||||||
def command_lookup(name):
|
def command_lookup(name):
|
||||||
results, exact = pokedex.lookup.lookup(name)
|
results = pokedex.lookup.lookup(name)
|
||||||
if exact:
|
if not results:
|
||||||
|
print "No matches."
|
||||||
|
elif results[0].exact:
|
||||||
print "Matched:"
|
print "Matched:"
|
||||||
else:
|
else:
|
||||||
print "Fuzzy-matched:"
|
print "Fuzzy-matched:"
|
||||||
|
|
||||||
for object in results:
|
for object, language, exact in results:
|
||||||
print object.__tablename__, object.name
|
print object.__tablename__, object.name, language
|
||||||
|
|
||||||
|
|
||||||
def command_help():
|
def command_help():
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
|
from collections import namedtuple
|
||||||
import os, os.path
|
import os, os.path
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
import re
|
import re
|
||||||
|
@ -13,6 +14,7 @@ import whoosh.spelling
|
||||||
|
|
||||||
from pokedex.db import connect
|
from pokedex.db import connect
|
||||||
import pokedex.db.tables as tables
|
import pokedex.db.tables as tables
|
||||||
|
from pokedex.roomaji import romanize
|
||||||
|
|
||||||
# Dictionary of table name => table class.
|
# Dictionary of table name => table class.
|
||||||
# Need the table name so we can get the class from the table name after we
|
# Need the table name so we can get the class from the table name after we
|
||||||
|
@ -69,10 +71,9 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
if directory_exists and not recreate:
|
if directory_exists and not recreate:
|
||||||
# Already exists; should be an index!
|
# Already exists; should be an index!
|
||||||
try:
|
try:
|
||||||
index = whoosh.index.open_dir(directory, indexname='pokedex')
|
index = whoosh.index.open_dir(directory, indexname='MAIN')
|
||||||
spell_store = whoosh.filedb.filestore.FileStorage(directory)
|
spell_store = whoosh.filedb.filestore.FileStorage(directory)
|
||||||
speller = whoosh.spelling.SpellChecker(spell_store,
|
speller = whoosh.spelling.SpellChecker(spell_store)
|
||||||
indexname='spelling')
|
|
||||||
return index, speller
|
return index, speller
|
||||||
except whoosh.index.EmptyIndexError as e:
|
except whoosh.index.EmptyIndexError as e:
|
||||||
# Apparently not a real index. Fall out of the if and create it
|
# Apparently not a real index. Fall out of the if and create it
|
||||||
|
@ -90,8 +91,7 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
language=whoosh.fields.STORED,
|
language=whoosh.fields.STORED,
|
||||||
)
|
)
|
||||||
|
|
||||||
index = whoosh.index.create_in(directory, schema=schema,
|
index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
|
||||||
indexname='pokedex')
|
|
||||||
writer = index.writer()
|
writer = index.writer()
|
||||||
|
|
||||||
# Index every name in all our tables of interest
|
# Index every name in all our tables of interest
|
||||||
|
@ -106,42 +106,57 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
for row in q.yield_per(5):
|
for row in q.yield_per(5):
|
||||||
row_key = dict(table=cls.__tablename__, row_id=row.id)
|
row_key = dict(table=cls.__tablename__, row_id=row.id)
|
||||||
|
|
||||||
# Spelling index only indexes strings of letters, alas, so we
|
|
||||||
# reduce every name to this to make the index work. However, exact
|
|
||||||
# matches are not returned, so e.g. 'nidoran' would neither match
|
|
||||||
# exactly nor fuzzy-match. Solution: add the spelling-munged name
|
|
||||||
# as a regular index row too.
|
|
||||||
name = row.name.lower()
|
name = row.name.lower()
|
||||||
writer.add_document(name=name, **row_key)
|
writer.add_document(name=name, **row_key)
|
||||||
|
|
||||||
speller_entries.append(name)
|
speller_entries.append(name)
|
||||||
|
|
||||||
for extra_key_func in extra_keys.get(cls, []):
|
for extra_key_func in extra_keys.get(cls, []):
|
||||||
extra_key = extra_key_func(row)
|
extra_key = extra_key_func(row)
|
||||||
writer.add_document(name=extra_key, **row_key)
|
writer.add_document(name=extra_key, **row_key)
|
||||||
|
|
||||||
|
# Pokemon also get other languages
|
||||||
|
if cls == tables.Pokemon:
|
||||||
|
for foreign_name in row.foreign_names:
|
||||||
|
name = foreign_name.name.lower()
|
||||||
|
writer.add_document(name=name,
|
||||||
|
language=foreign_name.language.name,
|
||||||
|
**row_key)
|
||||||
|
speller_entries.append(name)
|
||||||
|
|
||||||
|
if foreign_name.language.name == 'Japanese':
|
||||||
|
# Add Roomaji too
|
||||||
|
roomaji = romanize(foreign_name.name).lower()
|
||||||
|
writer.add_document(name=roomaji,
|
||||||
|
language='Roomaji',
|
||||||
|
**row_key)
|
||||||
|
speller_entries.append(roomaji)
|
||||||
|
|
||||||
|
|
||||||
writer.commit()
|
writer.commit()
|
||||||
|
|
||||||
# Construct and populate a spell-checker index. Quicker to do it all
|
# Construct and populate a spell-checker index. Quicker to do it all
|
||||||
# at once, as every call to add_* does a commit(), and those seem to be
|
# at once, as every call to add_* does a commit(), and those seem to be
|
||||||
# expensive
|
# expensive
|
||||||
speller = whoosh.spelling.SpellChecker(index.storage, indexname='spelling')
|
speller = whoosh.spelling.SpellChecker(index.storage)
|
||||||
speller.add_words(speller_entries)
|
speller.add_words(speller_entries)
|
||||||
|
|
||||||
return index, speller
|
return index, speller
|
||||||
|
|
||||||
|
|
||||||
|
LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
|
||||||
def lookup(name, session=None, indices=None, exact_only=False):
|
def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
"""Attempts to find some sort of object, given a database session and name.
|
"""Attempts to find some sort of object, given a database session and name.
|
||||||
|
|
||||||
Returns (objects, exact) where `objects` is a list of database objects, and
|
Returns a list of `LookupResult` named tuples (object, language, exact). `object` is a
|
||||||
`exact` is True iff the given name matched the returned objects exactly.
|
database object, `language` is the name of the language in which the name
|
||||||
|
was found, and `exact` is True iff this was an exact match.
|
||||||
|
|
||||||
This function ONLY does fuzzy matching if there are no exact matches.
|
This function currently ONLY does fuzzy matching if there are no exact
|
||||||
|
matches.
|
||||||
|
|
||||||
Formes are not returned; "Shaymin" will return only grass Shaymin.
|
Formes are not returned; "Shaymin" will return only grass Shaymin.
|
||||||
|
|
||||||
Currently recognizes:
|
Recognizes:
|
||||||
- Pokémon names: "Eevee"
|
- Pokémon names: "Eevee"
|
||||||
|
|
||||||
`name`
|
`name`
|
||||||
|
@ -170,6 +185,8 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
else:
|
else:
|
||||||
index, speller = open_index()
|
index, speller = open_index()
|
||||||
|
|
||||||
|
name = unicode(name)
|
||||||
|
|
||||||
exact = True
|
exact = True
|
||||||
|
|
||||||
# Look for exact name. A Term object does an exact match, so we don't have
|
# Look for exact name. A Term object does an exact match, so we don't have
|
||||||
|
@ -178,17 +195,16 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
query = whoosh.query.Term('name', name.lower())
|
query = whoosh.query.Term('name', name.lower())
|
||||||
results = searcher.search(query)
|
results = searcher.search(query)
|
||||||
|
|
||||||
if not exact_only:
|
# Look for some fuzzy matches if necessary
|
||||||
# Look for some fuzzy matches
|
if not exact_only and not results:
|
||||||
if not results:
|
|
||||||
exact = False
|
exact = False
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for suggestion in speller.suggest(name, 3):
|
for suggestion in speller.suggest(name, 10):
|
||||||
query = whoosh.query.Term('name', suggestion)
|
query = whoosh.query.Term('name', suggestion)
|
||||||
results.extend(searcher.search(query))
|
results.extend(searcher.search(query))
|
||||||
|
|
||||||
# Convert results to db objects
|
### Convert results to db objects
|
||||||
objects = []
|
objects = []
|
||||||
seen = {}
|
seen = {}
|
||||||
for result in results:
|
for result in results:
|
||||||
|
@ -200,6 +216,6 @@ def lookup(name, session=None, indices=None, exact_only=False):
|
||||||
|
|
||||||
cls = indexed_tables[result['table']]
|
cls = indexed_tables[result['table']]
|
||||||
obj = session.query(cls).get(result['row_id'])
|
obj = session.query(cls).get(result['row_id'])
|
||||||
objects.append(obj)
|
objects.append(LookupResult(obj, result['language'], exact))
|
||||||
|
|
||||||
return objects, exact
|
return objects
|
||||||
|
|
|
@ -131,4 +131,4 @@ def romanize(string):
|
||||||
if last_kana == 'sokuon':
|
if last_kana == 'sokuon':
|
||||||
raise ValueError("Sokuon cannot be the last character.")
|
raise ValueError("Sokuon cannot be the last character.")
|
||||||
|
|
||||||
return ''.join(characters)
|
return unicode(''.join(characters))
|
||||||
|
|
Loading…
Reference in a new issue