Finally weight lookup results by language. #15

This commit is contained in:
Eevee 2009-08-22 19:44:57 -07:00
parent 9a20969336
commit a9a88ec3e4

View file

@ -10,12 +10,15 @@ import whoosh.filedb.filestore
import whoosh.filedb.fileindex import whoosh.filedb.fileindex
import whoosh.index import whoosh.index
from whoosh.qparser import QueryParser from whoosh.qparser import QueryParser
import whoosh.scoring
import whoosh.spelling import whoosh.spelling
from pokedex.db import connect from pokedex.db import connect
import pokedex.db.tables as tables import pokedex.db.tables as tables
from pokedex.roomaji import romanize from pokedex.roomaji import romanize
__all__ = ['open_index', 'lookup']
# Dictionary of table name => table class. # Dictionary of table name => table class.
# Need the table name so we can get the class from the table name after we # Need the table name so we can get the class from the table name after we
# retrieve something from the index # retrieve something from the index
@ -135,6 +138,23 @@ def open_index(directory=None, session=None, recreate=False):
return index, speller return index, speller
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.

    The incoming weight is scaled by a per-language factor: unchanged for
    default-language (English) rows, 0.95 for Roomaji, and 0.9 for every
    other language.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Return `weight` scaled according to the matched row's language.

        Only `searcher`, `docnum` and `weight` are used here; the other
        parameters are accepted but ignored (presumably to match the
        signature the base Weighting class expects -- confirm against the
        whoosh version in use).
        """
        language = searcher.stored_fields(docnum)['language']

        # Identity test: None is a singleton, and `==` could be hijacked by
        # a custom __eq__ on the stored value.
        if language is None:
            # English (well, "default"); leave the weight untouched
            return weight

        if language == u'Roomaji':
            # Penalize Roomaji less than other foreign languages, as it's
            # the one most likely to be searched for
            return weight * 0.95

        # Everything else can drop down the totem pole
        return weight * 0.9
rx_is_number = re.compile('^\d+$') rx_is_number = re.compile('^\d+$')
LookupResult = namedtuple('LookupResult', LookupResult = namedtuple('LookupResult',
@ -188,19 +208,22 @@ def lookup(input, session=None, indices=None, exact_only=False):
name = unicode(input).lower() name = unicode(input).lower()
exact = True exact = True
# If the input provided is a number, match it as an id. Otherwise, name # If the input provided is a number, match it as an id. Otherwise, name.
if rx_is_number.match(input): # Term objects do an exact match, so we don't have to worry about a query
query_column = 'row_id' # parser tripping on weird characters in the input
exact_only = True # don't spell-check numbers! if rx_is_number.match(name):
# Don't spell-check numbers!
exact_only = True
query = whoosh.query.Term(u'row_id', name)
else: else:
# Not an integer # Not an integer
query_column = 'name' query = whoosh.query.Term(u'name', name)
# Look for exact name. A Term object does an exact match, so we don't have ### Actual searching
# to worry about a query parser tripping on weird characters in the input
searcher = index.searcher() searcher = index.searcher()
query = whoosh.query.Term(query_column, name) searcher.weighting = LanguageWeighting() # XXX kosher? docs say search()
print query # takes a weighting kw but it
# certainly does not
results = searcher.search(query) results = searcher.search(query)
# Look for some fuzzy matches if necessary # Look for some fuzzy matches if necessary
@ -217,12 +240,6 @@ def lookup(input, session=None, indices=None, exact_only=False):
seen = {} seen = {}
for result in results: for result in results:
# Skip dupe results # Skip dupe results
# Note! The speller prefers English names, but the query does not. So
# "latias" comes over "ratiasu". "latias" matches only the English
# row, comes out first, and all is well.
# However! The speller could then return "foo" which happens to be the
# name for two different things in different languages, and the
# non-English one could appear preferred. This is not very likely.
seen_key = result['table'], result['row_id'] seen_key = result['table'], result['row_id']
if seen_key in seen: if seen_key in seen:
continue continue