mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Finally weight lookup results by language. #15
This commit is contained in:
parent
9a20969336
commit
a9a88ec3e4
1 changed file with 32 additions and 15 deletions
|
@ -10,12 +10,15 @@ import whoosh.filedb.filestore
|
||||||
import whoosh.filedb.fileindex
|
import whoosh.filedb.fileindex
|
||||||
import whoosh.index
|
import whoosh.index
|
||||||
from whoosh.qparser import QueryParser
|
from whoosh.qparser import QueryParser
|
||||||
|
import whoosh.scoring
|
||||||
import whoosh.spelling
|
import whoosh.spelling
|
||||||
|
|
||||||
from pokedex.db import connect
|
from pokedex.db import connect
|
||||||
import pokedex.db.tables as tables
|
import pokedex.db.tables as tables
|
||||||
from pokedex.roomaji import romanize
|
from pokedex.roomaji import romanize
|
||||||
|
|
||||||
|
__all__ = ['open_index', 'lookup']
|
||||||
|
|
||||||
# Dictionary of table name => table class.
|
# Dictionary of table name => table class.
|
||||||
# Need the table name so we can get the class from the table name after we
|
# Need the table name so we can get the class from the table name after we
|
||||||
# retrieve something from the index
|
# retrieve something from the index
|
||||||
|
@ -135,6 +138,23 @@ def open_index(directory=None, session=None, recreate=False):
|
||||||
return index, speller
|
return index, speller
|
||||||
|
|
||||||
|
|
||||||
|
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.

    The incoming weight is scaled by a factor chosen from the matched
    document's stored ``language`` field:

    * ``None`` (English, i.e. the default language): weight is unchanged
    * ``u'Roomaji'``: weight * 0.95
    * anything else: weight * 0.9
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Return `weight` scaled according to the document's language.

        `searcher` must provide ``stored_fields(docnum)``, returning a
        mapping with a ``'language'`` key for the document `docnum`.
        `fieldnum`, `text`, and `QTF` are accepted but not used by this
        implementation (presumably they are part of the whoosh Weighting
        interface -- confirm against the whoosh version in use).
        """
        language = searcher.stored_fields(docnum)['language']

        if language is None:
            # English (well, "default"); pass the weight through unscaled
            return weight
        elif language == u'Roomaji':
            # Roomaji is the foreign form most likely to be searched for,
            # so it is penalized less than the other foreign languages
            return weight * 0.95
        else:
            # Everything else can drop down the totem pole
            return weight * 0.9
|
||||||
|
|
||||||
rx_is_number = re.compile('^\d+$')
|
rx_is_number = re.compile('^\d+$')
|
||||||
|
|
||||||
LookupResult = namedtuple('LookupResult',
|
LookupResult = namedtuple('LookupResult',
|
||||||
|
@ -188,19 +208,22 @@ def lookup(input, session=None, indices=None, exact_only=False):
|
||||||
name = unicode(input).lower()
|
name = unicode(input).lower()
|
||||||
exact = True
|
exact = True
|
||||||
|
|
||||||
# If the input provided is a number, match it as an id. Otherwise, name
|
# If the input provided is a number, match it as an id. Otherwise, name.
|
||||||
if rx_is_number.match(input):
|
# Term objects do an exact match, so we don't have to worry about a query
|
||||||
query_column = 'row_id'
|
# parser tripping on weird characters in the input
|
||||||
exact_only = True # don't spell-check numbers!
|
if rx_is_number.match(name):
|
||||||
|
# Don't spell-check numbers!
|
||||||
|
exact_only = True
|
||||||
|
query = whoosh.query.Term(u'row_id', name)
|
||||||
else:
|
else:
|
||||||
# Not an integer
|
# Not an integer
|
||||||
query_column = 'name'
|
query = whoosh.query.Term(u'name', name)
|
||||||
|
|
||||||
# Look for exact name. A Term object does an exact match, so we don't have
|
### Actual searching
|
||||||
# to worry about a query parser tripping on weird characters in the input
|
|
||||||
searcher = index.searcher()
|
searcher = index.searcher()
|
||||||
query = whoosh.query.Term(query_column, name)
|
searcher.weighting = LanguageWeighting() # XXX kosher? docs say search()
|
||||||
print query
|
# takes a weighting kw but it
|
||||||
|
# certainly does not
|
||||||
results = searcher.search(query)
|
results = searcher.search(query)
|
||||||
|
|
||||||
# Look for some fuzzy matches if necessary
|
# Look for some fuzzy matches if necessary
|
||||||
|
@ -217,12 +240,6 @@ def lookup(input, session=None, indices=None, exact_only=False):
|
||||||
seen = {}
|
seen = {}
|
||||||
for result in results:
|
for result in results:
|
||||||
# Skip dupe results
|
# Skip dupe results
|
||||||
# Note! The speller prefers English names, but the query does not. So
|
|
||||||
# "latias" comes over "ratiasu". "latias" matches only the English
|
|
||||||
# row, comes out first, and all is well.
|
|
||||||
# However! The speller could then return "foo" which happens to be the
|
|
||||||
# name for two different things in different languages, and the
|
|
||||||
# non-English one could appear preferred. This is not very likely.
|
|
||||||
seen_key = result['table'], result['row_id']
|
seen_key = result['table'], result['row_id']
|
||||||
if seen_key in seen:
|
if seen_key in seen:
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in a new issue