Added support for lookup by other language name. #15

English fuzzy matches are preferred, followed by Roomaji and then everything else. Each result tuple returned by lookup() now has a `name` field holding the actual name that was matched.
2024-08-20 18:16:34 +00:00 · 2009-08-22 01:13:34 -07:00 · 2009-08-22 01:13:34 -07:00 · 2bc41e2c62
commit 2bc41e2c62
parent 4e51867e95
2 changed files with 46 additions and 24 deletions
--- a/pokedex/__init__.py
+++ b/pokedex/__init__.py
@@ -66,8 +66,12 @@ def command_lookup(name):
    else:
        print "Fuzzy-matched:"

-    for object, language, exact in results:
-        print object.__tablename__, object.name, language
+    for result in results:
+        print "%s: %s" % (result.object.__tablename__, result.object.name),
+        if result.language:
+            print "(%s in %s)" % (result.name, result.language)
+        else:
+            print


 def command_help():
--- a/pokedex/lookup.py
+++ b/pokedex/lookup.py
@@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
    writer = index.writer()

    # Index every name in all our tables of interest
+    # speller_entries becomes a list of (word, score) tuples.  The scores used
+    # below are 1 for English names, 3 for other languages, and 8 for Roomaji;
+    # the intent is to bias results the way most people expect, especially
+    # when e.g. German names are very similar to English names
    speller_entries = []
    for cls in indexed_tables.values():
        q = session.query(cls)
@@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):

            name = row.name.lower()
            writer.add_document(name=name, **row_key)
-            speller_entries.append(name)
+            speller_entries.append((name, 1))

            for extra_key_func in extra_keys.get(cls, []):
                extra_key = extra_key_func(row)
                writer.add_document(name=extra_key, **row_key)

            # Pokemon also get other languages
-            if cls == tables.Pokemon:
-                for foreign_name in row.foreign_names:
-                    name = foreign_name.name.lower()
-                    writer.add_document(name=name,
+            for foreign_name in getattr(row, 'foreign_names', []):
+                moonspeak = foreign_name.name.lower()
+                if name == moonspeak:
+                    # Don't add the English name again as a different language;
+                    # no point and it makes spell results confusing
+                    continue
+
+                writer.add_document(name=moonspeak,
                                    language=foreign_name.language.name,
                                    **row_key)
-                    speller_entries.append(name)
+                speller_entries.append((moonspeak, 3))

-                    if foreign_name.language.name == 'Japanese':
                # Add Roomaji too
+                if foreign_name.language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name).lower()
-                        writer.add_document(name=roomaji,
-                                            language='Roomaji',
+                    writer.add_document(name=roomaji, language='Roomaji',
                                        **row_key)
-                        speller_entries.append(roomaji)
+                    speller_entries.append((roomaji, 8))


    writer.commit()
@@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    speller = whoosh.spelling.SpellChecker(index.storage)
-    speller.add_words(speller_entries)
+    speller.add_scored_words(speller_entries)

    return index, speller


-LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
+LookupResult = namedtuple('LookupResult',
+                          ['object', 'name', 'language', 'exact'])
 def lookup(name, session=None, indices=None, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

-    Returns a list of named (object, language, exact) tuples.  `object` is a
-    database object, `language` is the name of the language in which the name
-    was found, and `exact` is True iff this was an exact match.
+    Returns a list of named (object, name, language, exact) tuples.  `object`
+    is a database object, `name` is the name under which the object was found,
+    `language` is the name of the language in which the name was found, and
+    `exact` is True iff this was an exact match.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.
@@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
    seen = {}
    for result in results:
        # Skip dupe results
+        # Note!  The speller prefers English names, but the query does not.  So
+        # "latias" comes over "ratiasu".  "latias" matches only the English
+        # row, comes out first, and all is well.
+        # However!  The speller could then return "foo" which happens to be the
+        # name for two different things in different languages, and the
+        # non-English one could appear preferred.  This is not very likely.
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
@@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):

        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
-        objects.append(LookupResult(obj, result['language'], exact))
+        objects.append(LookupResult(object=obj,
+                                    name=result['name'],
+                                    language=result['language'],
+                                    exact=exact))

-    return objects
+    return objects[:5]