Removed the need for a spelling column in the lookup index. #15

Whoosh's spelling module unfortunately ignores any "words" that don't
look like words, even though the algorithm works fine with arbitrary
input.

I had to clone some code from whoosh.spelling, but avoiding the
isalpha() check solved a bunch of problems.  Now the index happily
compares against anything I feed into it.
This commit is contained in:
Eevee 2009-07-26 00:37:37 -07:00
parent bbfaf43d2b
commit cce9c26125

View file

@ -17,6 +17,14 @@ for cls in [
]: ]:
indexed_tables[cls.__tablename__] = cls indexed_tables[cls.__tablename__] = cls
# Dictionary of extra keys to file types of objects under, e.g. Pokémon can
# also be looked up purely by number
extra_keys = {
tables.Pokemon: [
lambda row: unicode(row.id),
],
}
index_bits = {} index_bits = {}
def get_index(session): def get_index(session):
"""Returns (index, speller). """Returns (index, speller).
@ -30,16 +38,17 @@ def get_index(session):
store = whoosh.store.RamStorage() store = whoosh.store.RamStorage()
schema = whoosh.fields.Schema( schema = whoosh.fields.Schema(
name=whoosh.fields.ID(stored=True), name=whoosh.fields.ID(stored=True),
spelling_name=whoosh.fields.ID(stored=True),
table=whoosh.fields.STORED, table=whoosh.fields.STORED,
row_id=whoosh.fields.STORED, row_id=whoosh.fields.STORED,
language_id=whoosh.fields.STORED, language_id=whoosh.fields.STORED,
) )
# Construct a straight lookup index
index = whoosh.index.Index(store, schema=schema, create=True) index = whoosh.index.Index(store, schema=schema, create=True)
writer = index.writer() writer = index.writer()
# Index every name in all our tables of interest # Index every name in all our tables of interest
speller_entries = []
for cls in indexed_tables.values(): for cls in indexed_tables.values():
q = session.query(cls) q = session.query(cls)
@ -48,24 +57,48 @@ def get_index(session):
q = q.filter_by(forme_base_pokemon_id=None) q = q.filter_by(forme_base_pokemon_id=None)
for row in q.yield_per(5): for row in q.yield_per(5):
row_key = dict(table=cls.__tablename__, row_id=row.id)
# Spelling index only indexes strings of letters, alas, so we
# reduce every name to this to make the index work. However, exact
# matches are not returned, so e.g. 'nidoran' would neither match
# exactly nor fuzzy-match. Solution: add the spelling-munged name
# as a regular index row too.
name = row.name.lower() name = row.name.lower()
spelling_name = re.sub('[^a-z]', '', name) writer.add_document(name=name, **row_key)
writer.add_document(name=name,
spelling_name=spelling_name, speller_entries.append(name)
table=cls.__tablename__,
row_id=row.id) for extra_key_func in extra_keys[cls]:
extra_key = extra_key_func(row)
writer.add_document(name=extra_key, **row_key)
writer.commit() writer.commit()
### Construct a spell-checker index # Construct and populate a spell-checker index. Quicker to do it all
# at once, as every call to add_* does a commit(), and those seem to be
# expensive
speller = whoosh.spelling.SpellChecker(index.storage) speller = whoosh.spelling.SpellChecker(index.storage)
# WARNING: HERE BE DRAGONS
# Can't use speller.add_field because it tries to intuit a frequency, and # whoosh.spelling refuses to index things that don't look like words.
# names are in an ID field, which seems to be immune to frequency. # Unfortunately, this doesn't work so well for Pokémon (Mr. Mime,
# Not hard to add everything ourselves, though # Porygon-Z, etc.), and attempts to work around it lead to further
reader = index.doc_reader() # complications.
speller.add_words([ _['spelling_name'] for _ in reader ]) # The below is copied from SpellChecker.add_scored_words without the check
reader.close() # for isalpha(). XXX get whoosh patched to make this unnecessary!
writer = whoosh.writing.IndexWriter(speller.index())
for word in speller_entries:
fields = {"word": word, "score": 1}
for size in xrange(speller.mingram, speller.maxgram + 1):
nga = whoosh.analysis.NgramAnalyzer(size)
gramlist = [t.text for t in nga(word)]
if len(gramlist) > 0:
fields["start%s" % size] = gramlist[0]
fields["end%s" % size] = gramlist[-1]
fields["gram%s" % size] = " ".join(gramlist)
writer.add_document(**fields)
writer.commit()
# end copy-pasta
index_bits['index'] = index index_bits['index'] = index
index_bits['speller'] = speller index_bits['speller'] = speller
@ -88,35 +121,23 @@ def lookup(session, name, exact_only=False):
exact = True exact = True
# Alas! We have to make three attempts to find anything with this index.
# First: Try an exact match for a name in the index.
# Second: Try an exact match for a stripped-down name in the index.
# Third: Get spelling suggestions.
# The spelling module apparently only indexes *words* -- that is, [a-z]+.
# So we have a separate field that contains the same name, stripped down to
# just [a-z]+.
# Unfortunately, exact matches aren't returned as spelling suggestions, so
# we also have to do a regular index match against this separate field.
# Otherwise, 'nidoran' will never match anything
index, speller = get_index(session) index, speller = get_index(session)
# Look for exact name # Look for exact name. A Term object does an exact match, so we don't have
parser = QueryParser('name', schema=index.schema) # to worry about a query parser tripping on weird characters in the input
results = index.find(name.lower(), parser=parser) searcher = index.searcher()
query = whoosh.query.Term('name', name)
results = searcher.search(query)
if not exact_only: if not exact_only:
# Look for a match with a reduced a-z name
if not results:
parser = QueryParser('spelling_name', schema=index.schema)
results = index.find(name.lower(), parser=parser)
# Look for some fuzzy matches # Look for some fuzzy matches
if not results: if not results:
results = []
exact = False exact = False
results = []
for suggestion in speller.suggest(name, 3): for suggestion in speller.suggest(name, 3):
results.extend( index.find(suggestion, parser=parser) ) query = whoosh.query.Term('name', suggestion)
results.extend(searcher.search(query))
# Convert results to db objects # Convert results to db objects
objects = [] objects = []