Added lookup support for foreign language names. #15

Changed lookup()'s return value to a list of named tuples, so that the
caller can tell which language each result was matched in.
This commit is contained in:
Eevee 2009-08-21 00:30:01 -07:00
parent 0b5eba6620
commit 4e51867e95
3 changed files with 55 additions and 32 deletions

View file

@ -13,6 +13,11 @@ def main():
command = sys.argv[1] command = sys.argv[1]
args = sys.argv[2:] args = sys.argv[2:]
# XXX there must be a better way to get Unicode argv
# XXX this doesn't work on Windows durp
enc = sys.stdin.encoding
args = [_.decode(enc) for _ in args]
# Find the command as a function in this file # Find the command as a function in this file
func = globals().get("command_%s" % command, None) func = globals().get("command_%s" % command, None)
if func: if func:
@ -53,14 +58,16 @@ def command_setup(*args):
def command_lookup(name): def command_lookup(name):
results, exact = pokedex.lookup.lookup(name) results = pokedex.lookup.lookup(name)
if exact: if not results:
print "No matches."
elif results[0].exact:
print "Matched:" print "Matched:"
else: else:
print "Fuzzy-matched:" print "Fuzzy-matched:"
for object in results: for object, language, exact in results:
print object.__tablename__, object.name print object.__tablename__, object.name, language
def command_help(): def command_help():

View file

@ -1,4 +1,5 @@
# encoding: utf8 # encoding: utf8
from collections import namedtuple
import os, os.path import os, os.path
import pkg_resources import pkg_resources
import re import re
@ -13,6 +14,7 @@ import whoosh.spelling
from pokedex.db import connect from pokedex.db import connect
import pokedex.db.tables as tables import pokedex.db.tables as tables
from pokedex.roomaji import romanize
# Dictionary of table name => table class. # Dictionary of table name => table class.
# Need the table name so we can get the class from the table name after we # Need the table name so we can get the class from the table name after we
@ -69,10 +71,9 @@ def open_index(directory=None, session=None, recreate=False):
if directory_exists and not recreate: if directory_exists and not recreate:
# Already exists; should be an index! # Already exists; should be an index!
try: try:
index = whoosh.index.open_dir(directory, indexname='pokedex') index = whoosh.index.open_dir(directory, indexname='MAIN')
spell_store = whoosh.filedb.filestore.FileStorage(directory) spell_store = whoosh.filedb.filestore.FileStorage(directory)
speller = whoosh.spelling.SpellChecker(spell_store, speller = whoosh.spelling.SpellChecker(spell_store)
indexname='spelling')
return index, speller return index, speller
except whoosh.index.EmptyIndexError as e: except whoosh.index.EmptyIndexError as e:
# Apparently not a real index. Fall out of the if and create it # Apparently not a real index. Fall out of the if and create it
@ -90,8 +91,7 @@ def open_index(directory=None, session=None, recreate=False):
language=whoosh.fields.STORED, language=whoosh.fields.STORED,
) )
index = whoosh.index.create_in(directory, schema=schema, index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
indexname='pokedex')
writer = index.writer() writer = index.writer()
# Index every name in all our tables of interest # Index every name in all our tables of interest
@ -106,42 +106,57 @@ def open_index(directory=None, session=None, recreate=False):
for row in q.yield_per(5): for row in q.yield_per(5):
row_key = dict(table=cls.__tablename__, row_id=row.id) row_key = dict(table=cls.__tablename__, row_id=row.id)
# Spelling index only indexes strings of letters, alas, so we
# reduce every name to this to make the index work. However, exact
# matches are not returned, so e.g. 'nidoran' would neither match
# exactly nor fuzzy-match. Solution: add the spelling-munged name
# as a regular index row too.
name = row.name.lower() name = row.name.lower()
writer.add_document(name=name, **row_key) writer.add_document(name=name, **row_key)
speller_entries.append(name) speller_entries.append(name)
for extra_key_func in extra_keys.get(cls, []): for extra_key_func in extra_keys.get(cls, []):
extra_key = extra_key_func(row) extra_key = extra_key_func(row)
writer.add_document(name=extra_key, **row_key) writer.add_document(name=extra_key, **row_key)
# Pokemon also get other languages
if cls == tables.Pokemon:
for foreign_name in row.foreign_names:
name = foreign_name.name.lower()
writer.add_document(name=name,
language=foreign_name.language.name,
**row_key)
speller_entries.append(name)
if foreign_name.language.name == 'Japanese':
# Add Roomaji too
roomaji = romanize(foreign_name.name).lower()
writer.add_document(name=roomaji,
language='Roomaji',
**row_key)
speller_entries.append(roomaji)
writer.commit() writer.commit()
# Construct and populate a spell-checker index. Quicker to do it all # Construct and populate a spell-checker index. Quicker to do it all
# at once, as every call to add_* does a commit(), and those seem to be # at once, as every call to add_* does a commit(), and those seem to be
# expensive # expensive
speller = whoosh.spelling.SpellChecker(index.storage, indexname='spelling') speller = whoosh.spelling.SpellChecker(index.storage)
speller.add_words(speller_entries) speller.add_words(speller_entries)
return index, speller return index, speller
LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
def lookup(name, session=None, indices=None, exact_only=False): def lookup(name, session=None, indices=None, exact_only=False):
"""Attempts to find some sort of object, given a database session and name. """Attempts to find some sort of object, given a database session and name.
Returns (objects, exact) where `objects` is a list of database objects, and Returns a list of named (object, language, exact) tuples. `object` is a
`exact` is True iff the given name matched the returned objects exactly. database object, `language` is the name of the language in which the name
was found, and `exact` is True iff this was an exact match.
This function ONLY does fuzzy matching if there are no exact matches. This function currently ONLY does fuzzy matching if there are no exact
matches.
Formes are not returned; "Shaymin" will return only grass Shaymin. Formes are not returned; "Shaymin" will return only grass Shaymin.
Currently recognizes: Recognizes:
- Pokémon names: "Eevee" - Pokémon names: "Eevee"
`name` `name`
@ -170,6 +185,8 @@ def lookup(name, session=None, indices=None, exact_only=False):
else: else:
index, speller = open_index() index, speller = open_index()
name = unicode(name)
exact = True exact = True
# Look for exact name. A Term object does an exact match, so we don't have # Look for exact name. A Term object does an exact match, so we don't have
@ -178,17 +195,16 @@ def lookup(name, session=None, indices=None, exact_only=False):
query = whoosh.query.Term('name', name.lower()) query = whoosh.query.Term('name', name.lower())
results = searcher.search(query) results = searcher.search(query)
if not exact_only: # Look for some fuzzy matches if necessary
# Look for some fuzzy matches if not exact_only and not results:
if not results: exact = False
exact = False results = []
results = []
for suggestion in speller.suggest(name, 3): for suggestion in speller.suggest(name, 10):
query = whoosh.query.Term('name', suggestion) query = whoosh.query.Term('name', suggestion)
results.extend(searcher.search(query)) results.extend(searcher.search(query))
# Convert results to db objects ### Convert results to db objects
objects = [] objects = []
seen = {} seen = {}
for result in results: for result in results:
@ -200,6 +216,6 @@ def lookup(name, session=None, indices=None, exact_only=False):
cls = indexed_tables[result['table']] cls = indexed_tables[result['table']]
obj = session.query(cls).get(result['row_id']) obj = session.query(cls).get(result['row_id'])
objects.append(obj) objects.append(LookupResult(obj, result['language'], exact))
return objects, exact return objects

View file

@ -131,4 +131,4 @@ def romanize(string):
if last_kana == 'sokuon': if last_kana == 'sokuon':
raise ValueError("Sokuon cannot be the last character.") raise ValueError("Sokuon cannot be the last character.")
return ''.join(characters) return unicode(''.join(characters))