Pokédex lookup now uses a whoosh index and spell-checker. #15

2024-08-20 18:16:34 +00:00 · 2009-07-25 01:28:33 -07:00 · 2009-07-25 01:28:33 -07:00 · b13ffac247
commit b13ffac247
parent 8e08f88fd8
3 changed files with 129 additions and 21 deletions
--- a/pokedex/init.py
+++ b/pokedex/init.py
@ -164,10 +164,14 @@ def lookup(engine_uri, name):
    # XXX don't require uri!  somehow
    session = connect(engine_uri)
-    results = pokedex_lookup(session, name)
+    results, exact = pokedex_lookup(session, name)
-    print "Matched:"
+    if exact:
-    for object, matchiness in results:
+        print "Matched:"
-        print object.__tablename__, object.name, "(%.03f)" % matchiness
+    else:
        print "Fuzzy-matched:"
    for object in results:
        print object.__tablename__, object.name
 def help():
--- a/pokedex/lookup.py
+++ b/pokedex/lookup.py
@ -1,18 +1,84 @@
 # encoding: utf8
 import re
 from sqlalchemy.sql import func
 import whoosh
 from whoosh.qparser import QueryParser
 import whoosh.spelling
 import pokedex.db.tables as tables
-def lookup(session, name):
+# Dictionary of table name => table class.
 # The index stores only a table's name alongside each entry, so this mapping
 # lets us get back to the table class after we retrieve something from the
 # index.
 # Filled in once, at import time, from the list of table classes below; add a
 # class to the list to have its names indexed by get_index().
 indexed_tables = {}
 for cls in [
        tables.Pokemon,
    ]:
    indexed_tables[cls.__tablename__] = cls
 # Module-level cache written by get_index(): empty until the index has been
 # built, afterwards holds the 'index', 'speller' and 'store' entries.
 index_bits = {}
 def get_index(session):
    """Returns (index, speller), building both on first use.

    The result is cached in the module-level `index_bits` dict.  Once that
    dict is populated, later calls return the cached objects immediately and
    never touch `session`.

    On the first call this:
    - creates a whoosh index on a RamStorage store,
    - adds one document per row of every class in `indexed_tables` (rows with
      a non-null `forme_base_pokemon_id` are filtered out for classes that
      have that attribute),
    - builds a whoosh SpellChecker on the index's storage and feeds it every
      stored `spelling_name`.

    `session` is the database session used to query the indexed tables.
    """
    # Already built: reuse the cached index and speller
    if index_bits:
        return index_bits['index'], index_bits['speller']
    store = whoosh.store.RamStorage()
    # `name` and `spelling_name` are ID fields that are also stored; the other
    # fields are only stored so a hit can be mapped back to a database row.
    # NOTE(review): `language_id` is declared here but no document added below
    # ever sets it.
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        spelling_name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language_id=whoosh.fields.STORED,
    )
    index = whoosh.index.Index(store, schema=schema, create=True)
    writer = index.writer()
    # Index every name in all our tables of interest
    for cls in indexed_tables.values():
        q = session.query(cls)
        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)
        for row in q.yield_per(5):
            # Names are indexed lowercased
            name = row.name.lower()
            # Same name with everything except a-z removed; this is what the
            # spell-checker is fed below
            spelling_name = re.sub('[^a-z]', '', name)
            writer.add_document(name=name,
                                spelling_name=spelling_name,
                                table=cls.__tablename__,
                                row_id=row.id)
    writer.commit()
    ### Construct a spell-checker index
    speller = whoosh.spelling.SpellChecker(index.storage)
    # Can't use speller.add_field because it tries to intuit a frequency, and
    # names are in an ID field, which seems to be immune to frequency.
    # Not hard to add everything ourselves, though
    reader = index.doc_reader()
    speller.add_words([ _['spelling_name'] for _ in reader ])
    reader.close()
    # Cache everything so the next call can return early; `store` is kept in
    # the cache too, although this function only ever returns the other two
    index_bits['index'] = index
    index_bits['speller'] = speller
    index_bits['store'] = store
    return index_bits['index'], index_bits['speller']
 def lookup(session, name, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.
-    Returns a list of (object, matchiness) tuples.  Matchiness is 1 for exact
+    Returns (objects, exact) where `objects` is a list of database objects, and
-    matches.  It is possible to get multiple exact matches; for example,
+    `exact` is True iff the given name matched the returned objects exactly.
    'Metronome' will match both the move and the item.  In these cases, the
    results are returned in rough order of "importance", e.g., Pokémon come
    before moves come before types.
-    This function does fuzzy matching iff there are no exact matches.
+    This function ONLY does fuzzy matching if there are no exact matches.
    Formes are not returned; "Shaymin" will return only grass Shaymin.
@ -20,12 +86,50 @@ def lookup(session, name):
    - Pokémon names: "Eevee"
    """
-    q = session.query(tables.Pokemon) \
+    exact = True
               .filter(func.lower(tables.Pokemon.name) == name.lower()) \
               .filter_by(forme_base_pokemon_id=None)
-    try:
+    # Alas!  We have to make three attempts to find anything with this index.
-        result = q.one()
+    # First: Try an exact match for a name in the index.
-        return [ (result, 1) ]
+    # Second: Try an exact match for a stripped-down name in the index.
-    except:
+    # Third: Get spelling suggestions.
-        return []
+    # The spelling module apparently only indexes *words* -- that is, [a-z]+.
    # So we have a separate field that contains the same name, stripped down to
    # just [a-z]+.
    # Unfortunately, exact matches aren't returned as spelling suggestions, so
    # we also have to do a regular index match against this separate field.
    # Otherwise, 'nidoran' will never match anything
    index, speller = get_index(session)
    # Look for exact name
    parser = QueryParser('name', schema=index.schema)
    results = index.find(name.lower(), parser=parser)
    if not exact_only:
        # Look for a match with a reduced a-z name
        if not results:
            parser = QueryParser('spelling_name', schema=index.schema)
            results = index.find(name.lower(), parser=parser)
        # Look for some fuzzy matches
        if not results:
            results = []
            exact = False
            for suggestion in speller.suggest(name, 3):
                results.extend( index.find(suggestion, parser=parser) )
    # Convert results to db objects
    objects = []
    seen = {}
    for result in results:
        # Skip dupe results
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen[seen_key] = True
        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
        objects.append(obj)
    return objects, exact
--- a/setup.py
+++ b/setup.py
@ -4,7 +4,7 @@ setup(
    version = '0.1',
    packages = find_packages(),
    package_data = { '': 'data' },
-    install_requires=['SQLAlchemy>=0.5.1'],
+    install_requires=['SQLAlchemy>=0.5.1', 'whoosh>=0.1.24'],
    entry_points = {
        'console_scripts': [