mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Finally weight lookup results by language. #15
This commit is contained in:
parent
9a20969336
commit
a9a88ec3e4
1 changed files with 32 additions and 15 deletions
|
@ -10,12 +10,15 @@ import whoosh.filedb.filestore
|
|||
import whoosh.filedb.fileindex
|
||||
import whoosh.index
|
||||
from whoosh.qparser import QueryParser
|
||||
import whoosh.scoring
|
||||
import whoosh.spelling
|
||||
|
||||
from pokedex.db import connect
|
||||
import pokedex.db.tables as tables
|
||||
from pokedex.roomaji import romanize
|
||||
|
||||
__all__ = ['open_index', 'lookup']
|
||||
|
||||
# Dictionary of table name => table class.
|
||||
# Need the table name so we can get the class from the table name after we
|
||||
# retrieve something from the index
|
||||
|
@ -135,6 +138,23 @@ def open_index(directory=None, session=None, recreate=False):
|
|||
return index, speller
|
||||
|
||||
|
||||
class LanguageWeighting(whoosh.scoring.Weighting):
|
||||
"""A scoring class that forces otherwise-equal English results to come
|
||||
before foreign results.
|
||||
"""
|
||||
|
||||
def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
|
||||
doc = searcher.stored_fields(docnum)
|
||||
if doc['language'] == None:
|
||||
# English (well, "default"); leave it at 1
|
||||
return weight
|
||||
elif doc['language'] == u'Roomaji':
|
||||
# Give Roomaji a bit of a boost, as it's most likely to be searched
|
||||
return weight * 0.95
|
||||
else:
|
||||
# Everything else can drop down the totem pole
|
||||
return weight * 0.9
|
||||
|
||||
rx_is_number = re.compile('^\d+$')
|
||||
|
||||
LookupResult = namedtuple('LookupResult',
|
||||
|
@ -188,19 +208,22 @@ def lookup(input, session=None, indices=None, exact_only=False):
|
|||
name = unicode(input).lower()
|
||||
exact = True
|
||||
|
||||
# If the input provided is a number, match it as an id. Otherwise, name
|
||||
if rx_is_number.match(input):
|
||||
query_column = 'row_id'
|
||||
exact_only = True # don't spell-check numbers!
|
||||
# If the input provided is a number, match it as an id. Otherwise, name.
|
||||
# Term objects do an exact match, so we don't have to worry about a query
|
||||
# parser tripping on weird characters in the input
|
||||
if rx_is_number.match(name):
|
||||
# Don't spell-check numbers!
|
||||
exact_only = True
|
||||
query = whoosh.query.Term(u'row_id', name)
|
||||
else:
|
||||
# Not an integer
|
||||
query_column = 'name'
|
||||
query = whoosh.query.Term(u'name', name)
|
||||
|
||||
# Look for exact name. A Term object does an exact match, so we don't have
|
||||
# to worry about a query parser tripping on weird characters in the input
|
||||
### Actual searching
|
||||
searcher = index.searcher()
|
||||
query = whoosh.query.Term(query_column, name)
|
||||
print query
|
||||
searcher.weighting = LanguageWeighting() # XXX kosher? docs say search()
|
||||
# takes a weighting kw but it
|
||||
# certainly does not
|
||||
results = searcher.search(query)
|
||||
|
||||
# Look for some fuzzy matches if necessary
|
||||
|
@ -217,12 +240,6 @@ def lookup(input, session=None, indices=None, exact_only=False):
|
|||
seen = {}
|
||||
for result in results:
|
||||
# Skip dupe results
|
||||
# Note! The speller prefers English names, but the query does not. So
|
||||
# "latias" comes over "ratiasu". "latias" matches only the English
|
||||
# row, comes out first, and all is well.
|
||||
# However! The speller could then return "foo" which happens to be the
|
||||
# name for two different things in different languages, and the
|
||||
# non-English one could appear preferred. This is not very likely.
|
||||
seen_key = result['table'], result['row_id']
|
||||
if seen_key in seen:
|
||||
continue
|
||||
|
|
Loading…
Reference in a new issue