From 4e51867e952cc46cbf467be44e5528f8225ccc7d Mon Sep 17 00:00:00 2001 From: Eevee Date: Fri, 21 Aug 2009 00:30:01 -0700 Subject: [PATCH] Added lookup support for foreign language names. #15 Changed lookup()'s return value to be a list of named tuples so the caller can know which language each result is in. --- pokedex/__init__.py | 15 +++++++--- pokedex/lookup.py | 70 ++++++++++++++++++++++++++++----------------- pokedex/roomaji.py | 2 +- 3 files changed, 55 insertions(+), 32 deletions(-) diff --git a/pokedex/__init__.py b/pokedex/__init__.py index 1c28b06..4af3fa0 100644 --- a/pokedex/__init__.py +++ b/pokedex/__init__.py @@ -13,6 +13,11 @@ def main(): command = sys.argv[1] args = sys.argv[2:] + # XXX there must be a better way to get Unicode argv + # XXX this doesn't work on Windows durp + enc = sys.stdin.encoding + args = [_.decode(enc) for _ in args] + # Find the command as a function in this file func = globals().get("command_%s" % command, None) if func: @@ -53,14 +58,16 @@ def command_setup(*args): def command_lookup(name): - results, exact = pokedex.lookup.lookup(name) - if exact: + results = pokedex.lookup.lookup(name) + if not results: + print "No matches." + elif results[0].exact: print "Matched:" else: print "Fuzzy-matched:" - for object in results: - print object.__tablename__, object.name + for object, language, exact in results: + print object.__tablename__, object.name, language def command_help(): diff --git a/pokedex/lookup.py b/pokedex/lookup.py index 5643f66..0bf1c18 100644 --- a/pokedex/lookup.py +++ b/pokedex/lookup.py @@ -1,4 +1,5 @@ # encoding: utf8 +from collections import namedtuple import os, os.path import pkg_resources import re @@ -13,6 +14,7 @@ import whoosh.spelling from pokedex.db import connect import pokedex.db.tables as tables +from pokedex.roomaji import romanize # Dictionary of table name => table class. # Need the table name so we can get the class from the table name after we @@ -69,10 +71,9 @@ def open_index(directory=None, session=None, recreate=False): if directory_exists and not recreate: # Already exists; should be an index! try: - index = whoosh.index.open_dir(directory, indexname='pokedex') + index = whoosh.index.open_dir(directory, indexname='MAIN') spell_store = whoosh.filedb.filestore.FileStorage(directory) - speller = whoosh.spelling.SpellChecker(spell_store, - indexname='spelling') + speller = whoosh.spelling.SpellChecker(spell_store) return index, speller except whoosh.index.EmptyIndexError as e: # Apparently not a real index. Fall out of the if and create it @@ -90,8 +91,7 @@ def open_index(directory=None, session=None, recreate=False): language=whoosh.fields.STORED, ) - index = whoosh.index.create_in(directory, schema=schema, - indexname='pokedex') + index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN') writer = index.writer() # Index every name in all our tables of interest @@ -106,42 +106,57 @@ def open_index(directory=None, session=None, recreate=False): for row in q.yield_per(5): row_key = dict(table=cls.__tablename__, row_id=row.id) - # Spelling index only indexes strings of letters, alas, so we - # reduce every name to this to make the index work. However, exact - # matches are not returned, so e.g. 'nidoran' would neither match - # exactly nor fuzzy-match. Solution: add the spelling-munged name - # as a regular index row too. name = row.name.lower() writer.add_document(name=name, **row_key) - speller_entries.append(name) for extra_key_func in extra_keys.get(cls, []): extra_key = extra_key_func(row) writer.add_document(name=extra_key, **row_key) + # Pokemon also get other languages + if cls == tables.Pokemon: + for foreign_name in row.foreign_names: + name = foreign_name.name.lower() + writer.add_document(name=name, + language=foreign_name.language.name, + **row_key) + speller_entries.append(name) + + if foreign_name.language.name == 'Japanese': + # Add Roomaji too + roomaji = romanize(foreign_name.name).lower() + writer.add_document(name=roomaji, + language='Roomaji', + **row_key) + speller_entries.append(roomaji) + + writer.commit() # Construct and populate a spell-checker index. Quicker to do it all # at once, as every call to add_* does a commit(), and those seem to be # expensive - speller = whoosh.spelling.SpellChecker(index.storage, indexname='spelling') + speller = whoosh.spelling.SpellChecker(index.storage) speller.add_words(speller_entries) return index, speller +LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact']) def lookup(name, session=None, indices=None, exact_only=False): """Attempts to find some sort of object, given a database session and name. - Returns (objects, exact) where `objects` is a list of database objects, and - `exact` is True iff the given name matched the returned objects exactly. + Returns a list of named (object, language, exact) tuples. `object` is a + database object, `language` is the name of the language in which the name + was found, and `exact` is True iff this was an exact match. - This function ONLY does fuzzy matching if there are no exact matches. + This function currently ONLY does fuzzy matching if there are no exact + matches. Formes are not returned; "Shaymin" will return only grass Shaymin. - Currently recognizes: + Recognizes: - Pokémon names: "Eevee" `name` @@ -170,6 +185,8 @@ def lookup(name, session=None, indices=None, exact_only=False): else: index, speller = open_index() + name = unicode(name) + exact = True # Look for exact name. A Term object does an exact match, so we don't have @@ -178,17 +195,16 @@ def lookup(name, session=None, indices=None, exact_only=False): query = whoosh.query.Term('name', name.lower()) results = searcher.search(query) - if not exact_only: - # Look for some fuzzy matches - if not results: - exact = False - results = [] + # Look for some fuzzy matches if necessary + if not exact_only and not results: + exact = False + results = [] - for suggestion in speller.suggest(name, 3): - query = whoosh.query.Term('name', suggestion) - results.extend(searcher.search(query)) + for suggestion in speller.suggest(name, 10): + query = whoosh.query.Term('name', suggestion) + results.extend(searcher.search(query)) - # Convert results to db objects + ### Convert results to db objects objects = [] seen = {} for result in results: @@ -200,6 +216,6 @@ def lookup(name, session=None, indices=None, exact_only=False): cls = indexed_tables[result['table']] obj = session.query(cls).get(result['row_id']) - objects.append(obj) + objects.append(LookupResult(obj, result['language'], exact)) - return objects, exact + return objects diff --git a/pokedex/roomaji.py b/pokedex/roomaji.py index 0a6b269..d1bae0f 100644 --- a/pokedex/roomaji.py +++ b/pokedex/roomaji.py @@ -131,4 +131,4 @@ def romanize(string): if last_kana == 'sokuon': raise ValueError("Sokuon cannot be the last character.") - return ''.join(characters) + return unicode(''.join(characters))