Pokédex lookup now uses a whoosh index and spell-checker. #15

This commit is contained in:
Eevee 2009-07-25 01:28:33 -07:00
parent 8e08f88fd8
commit b13ffac247
3 changed files with 129 additions and 21 deletions

View file

@ -164,10 +164,14 @@ def lookup(engine_uri, name):
# XXX don't require uri! somehow # XXX don't require uri! somehow
session = connect(engine_uri) session = connect(engine_uri)
results = pokedex_lookup(session, name) results, exact = pokedex_lookup(session, name)
print "Matched:" if exact:
for object, matchiness in results: print "Matched:"
print object.__tablename__, object.name, "(%.03f)" % matchiness else:
print "Fuzzy-matched:"
for object in results:
print object.__tablename__, object.name
def help(): def help():

View file

@ -1,18 +1,84 @@
# encoding: utf8 # encoding: utf8
import re
from sqlalchemy.sql import func from sqlalchemy.sql import func
import whoosh
from whoosh.qparser import QueryParser
import whoosh.spelling
import pokedex.db.tables as tables import pokedex.db.tables as tables
def lookup(session, name): # Dictionary of table name => table class.
# Need the table name so we can get the class from the table name after we
# retrieve something from the index
indexed_tables = {}
for cls in [
tables.Pokemon,
]:
indexed_tables[cls.__tablename__] = cls
index_bits = {}
def get_index(session):
"""Returns (index, speller).
Creates an index if one does not exist.
"""
if index_bits:
return index_bits['index'], index_bits['speller']
store = whoosh.store.RamStorage()
schema = whoosh.fields.Schema(
name=whoosh.fields.ID(stored=True),
spelling_name=whoosh.fields.ID(stored=True),
table=whoosh.fields.STORED,
row_id=whoosh.fields.STORED,
language_id=whoosh.fields.STORED,
)
index = whoosh.index.Index(store, schema=schema, create=True)
writer = index.writer()
# Index every name in all our tables of interest
for cls in indexed_tables.values():
q = session.query(cls)
# Only index base Pokémon formes
if hasattr(cls, 'forme_base_pokemon_id'):
q = q.filter_by(forme_base_pokemon_id=None)
for row in q.yield_per(5):
name = row.name.lower()
spelling_name = re.sub('[^a-z]', '', name)
writer.add_document(name=name,
spelling_name=spelling_name,
table=cls.__tablename__,
row_id=row.id)
writer.commit()
### Construct a spell-checker index
speller = whoosh.spelling.SpellChecker(index.storage)
# Can't use speller.add_field because it tries to intuit a frequency, and
# names are in an ID field, which seems to be immune to frequency.
# Not hard to add everything ourselves, though
reader = index.doc_reader()
speller.add_words([ _['spelling_name'] for _ in reader ])
reader.close()
index_bits['index'] = index
index_bits['speller'] = speller
index_bits['store'] = store
return index_bits['index'], index_bits['speller']
def lookup(session, name, exact_only=False):
"""Attempts to find some sort of object, given a database session and name. """Attempts to find some sort of object, given a database session and name.
Returns a list of (object, matchiness) tuples. Matchiness is 1 for exact Returns (objects, exact) where `objects` is a list of database objects, and
matches. It is possible to get multiple exact matches; for example, `exact` is True iff the given name matched the returned objects exactly.
'Metronome' will match both the move and the item. In these cases, the
results are returned in rough order of "importance", e.g., Pokémon come
before moves come before types.
This function does fuzzy matching iff there are no exact matches. This function ONLY does fuzzy matching if there are no exact matches.
Formes are not returned; "Shaymin" will return only grass Shaymin. Formes are not returned; "Shaymin" will return only grass Shaymin.
@ -20,12 +86,50 @@ def lookup(session, name):
- Pokémon names: "Eevee" - Pokémon names: "Eevee"
""" """
q = session.query(tables.Pokemon) \ exact = True
.filter(func.lower(tables.Pokemon.name) == name.lower()) \
.filter_by(forme_base_pokemon_id=None)
try: # Alas! We have to make three attempts to find anything with this index.
result = q.one() # First: Try an exact match for a name in the index.
return [ (result, 1) ] # Second: Try an exact match for a stripped-down name in the index.
except: # Third: Get spelling suggestions.
return [] # The spelling module apparently only indexes *words* -- that is, [a-z]+.
# So we have a separate field that contains the same name, stripped down to
# just [a-z]+.
# Unfortunately, exact matches aren't returned as spelling suggestions, so
# we also have to do a regular index match against this separate field.
# Otherwise, 'nidoran' will never match anything
index, speller = get_index(session)
# Look for exact name
parser = QueryParser('name', schema=index.schema)
results = index.find(name.lower(), parser=parser)
if not exact_only:
# Look for a match with a reduced a-z name
if not results:
parser = QueryParser('spelling_name', schema=index.schema)
results = index.find(name.lower(), parser=parser)
# Look for some fuzzy matches
if not results:
results = []
exact = False
for suggestion in speller.suggest(name, 3):
results.extend( index.find(suggestion, parser=parser) )
# Convert results to db objects
objects = []
seen = {}
for result in results:
# Skip dupe results
seen_key = result['table'], result['row_id']
if seen_key in seen:
continue
seen[seen_key] = True
cls = indexed_tables[result['table']]
obj = session.query(cls).get(result['row_id'])
objects.append(obj)
return objects, exact

View file

@ -4,7 +4,7 @@ setup(
version = '0.1', version = '0.1',
packages = find_packages(), packages = find_packages(),
package_data = { '': 'data' }, package_data = { '': 'data' },
install_requires=['SQLAlchemy>=0.5.1'], install_requires=['SQLAlchemy>=0.5.1', 'whoosh>=0.1.24'],
entry_points = { entry_points = {
'console_scripts': [ 'console_scripts': [