Merge remote-tracking branch 'origin/encukou'

2024-08-20 18:16:34 +00:00 · 2011-03-29 08:06:34 -07:00 · 2011-03-29 08:06:34 -07:00 · 4445305e7c
commit 4445305e7c
parent c91da22989 c25db1d2cf
4 changed files with 298 additions and 165 deletions
--- a/pokedex/init.py
+++ b/pokedex/init.py
@ -122,6 +122,8 @@ def command_load(*args):
    parser = get_parser(verbose=True)
    parser.add_option('-d', '--directory', dest='directory', default=None)
    parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
+    parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
+        help="Do not use backend-specific optimalizations.")
    options, tables = parser.parse_args(list(args))

    if not options.engine_uri:
@ -138,7 +140,7 @@ def command_load(*args):
                                  drop_tables=options.drop_tables,
                                  tables=tables,
                                  verbose=options.verbose,
-                                  safe=False)
+                                  safe=options.safe)

 def command_reindex(*args):
    parser = get_parser(verbose=True)
--- a/pokedex/db/load.py
+++ b/pokedex/db/load.py
@ -140,12 +140,16 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
    # Drop all tables if requested
    if drop_tables:
        print_start('Dropping tables')
-        for table in reversed(table_objs):
+        for n, table in enumerate(reversed(table_objs)):
            table.drop(checkfirst=True)
+            print_status('%s/%s' % (n, len(table_objs)))
        print_done()

-    for table in table_objs:
+    print_start('Creating tables')
+    for n, table in enumerate(table_objs):
        table.create()
+        print_status('%s/%s' % (n, len(table_objs)))
+    print_done()
    connection = session.connection()

    # Okay, run through the tables and actually load the data now
@ -168,6 +172,36 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
        reader = csv.reader(csvfile, lineterminator='\n')
        column_names = [unicode(column) for column in reader.next()]

+        if not safe and session.connection().dialect.name == 'postgresql':
+            """
+            Postgres' CSV dialect is nearly the same as ours, except that it
+            treats completely empty values as NULL, and empty quoted
+            strings ("") as empty strings.
+            The Pokedex dump does not quote empty strings, so both empty
+            strings and NULLs are read in as NULL.
+            For an empty string in a NOT NULL column, the load will fail, and
+            load will fall back to the cross-backend row-by-row loading. And in
+            nullable columns, we already load empty strings as NULL.
+            """
+            session.commit()
+            not_null_cols = [c for c in column_names if not table_obj.c[c].nullable]
+            if not_null_cols:
+                force_not_null = 'FORCE NOT NULL ' + ','.join('"%s"' % c for c in not_null_cols)
+            else:
+                force_not_null = ''
+            command = "COPY {table_name} ({columns}) FROM '{csvpath}' CSV HEADER {force_not_null}"
+            session.connection().execute(
+                    command.format(
+                            table_name=table_name,
+                            csvpath=csvpath,
+                            columns=','.join('"%s"' % c for c in column_names),
+                            force_not_null=force_not_null,
+                        )
+                )
+            session.commit()
+            print_done()
+            continue
+
        # Self-referential tables may contain rows with foreign keys of other
        # rows in the same table that do not yet exist.  Pull these out and add
        # them to the session last
--- a/pokedex/roomaji.py
+++ b/pokedex/roomaji.py
@ -1,7 +1,140 @@
 # encoding: utf8
-"""Provides `romanize()` for romanizing simple Japanese text."""
+"""Provides `romanize()` for romanizing simple Japanese text.

-_roomaji_kana = {
+Also provides available romanizers in a dictionary keyed by language identifier.
+"""
+
+class Romanizer(object):
+    def __init__(self, parent=None, **tables):
+        """Create a Romanizer
+
+        parent: A Romanizer to base this one on
+        tables: Dicts that become the object's attributes. If a parent is given,
+            its tables are used, and updated with the given ones
+        """
+        self.parent = parent
+        if parent:
+            self.tables = parent.tables
+            for name, table in tables.items():
+                # Take a copy -- don't want to clobber the parent's tables
+                self.tables[name] = dict(self.tables[name])
+                self.tables[name].update(table)
+        else:
+            self.tables = tables
+
+        for name, table in self.tables.items():
+            setattr(self, name, table)
+
+    def romanize(self, string):
+        """Convert a string of kana to roomaji."""
+
+        vowels = ['a', 'e', 'i', 'o', 'u', 'y']
+
+        characters = []
+        last_kana = None  # Used for ー; っ or ッ; ん or ン
+        last_char = None  # Used for small kana combos
+        for char in string:
+            # Full-width Latin
+            if 0xff01 <= ord(char) <= 0xff5e:
+                if last_kana == 'sokuon':
+                    raise ValueError("Sokuon cannot precede Latin characters.")
+
+                # XXX Real Unicode decomposition would be nicer
+                char = chr(ord(char) - 0xff01 + 0x21)
+                characters.append(char)
+
+                last_kana = None
+
+            # Small vowel kana
+            elif char in self.roomaji_small_kana:
+                combo = last_char + char
+                if combo in self.roomaji_small_kana_combos:
+                    characters[-1] = self.roomaji_small_kana_combos[combo]
+
+                else:
+                    # If we don't know what it is...  act dumb and treat it as a
+                    # full-size vowel.  Better than bailing, and seems to occur a
+                    # lot, e.g. ピィ is "pii"
+                    characters.append(self.roomaji_small_kana[char])
+
+                last_kana = self.roomaji_small_kana[char]
+
+            # Youon
+            elif char in self.roomaji_youon:
+                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
+                    raise ValueError("Youon must follow an -i sound.")
+
+                # Drop the -i and append the ya/yu/yo sound
+                new_sound = self.roomaji_youon[char]
+                if last_kana in self.y_drop:
+                    # Strip the y-
+                    new_char = self.y_drop[last_kana] + new_sound[1:]
+                else:
+                    new_char = last_kana[:-1] + new_sound
+
+                characters[-1] = new_char
+                last_kana = new_char
+
+            # Sokuon
+            elif char in (u'っ', u'ッ'):
+                # Remember it and double the consonant next time around
+                last_kana = 'sokuon'
+
+            # Extended vowel or n
+            elif char == u'ー':
+                if last_kana[-1] not in vowels:
+                    raise ValueError(u"'ー' must follow by a vowel.")
+                if last_kana[-1] in self.lengthened_vowels:
+                    characters[-1] = characters[-1][:-1]
+                    characters.append(self.lengthened_vowels[last_kana[-1]])
+                else:
+                    characters.append(last_kana[-1])
+
+                last_kana = None
+
+            # Regular ol' kana
+            elif char in self.roomaji_kana:
+                kana = self.roomaji_kana[char]
+
+                if last_kana == 'sokuon':
+                    if kana[0] in vowels:
+                        raise ValueError("Sokuon cannot precede a vowel.")
+
+                    characters.append(kana[0])
+                elif last_kana == 'n' and kana[0] in vowels:
+                    characters.append("'")
+
+                # Special characters for doubled kana
+                if kana[0] in self.lengthened_vowels and characters and kana == characters[-1][-1]:
+                    kana = self.lengthened_vowels[kana[0]]
+                    characters[-1] = characters[-1][:-1]
+
+                characters.append(kana)
+
+                last_kana = kana
+
+            # Not Japanese?
+            else:
+                if last_kana == 'sokuon':
+                    raise ValueError("Sokuon must be followed by another kana.")
+
+                characters.append(char)
+
+                last_kana = None
+
+            last_char = char
+
+
+        if last_kana == 'sokuon':
+            raise ValueError("Sokuon cannot be the last character.")
+
+        return unicode(''.join(characters))
+
+
+romanizers = dict()
+
+romanizers['en'] = Romanizer(
+    roomaji_kana={
        # Hiragana
        u'あ': 'a',     u'い': 'i',     u'う': 'u',     u'え': 'e',     u'お': 'o',
        u'か': 'ka',    u'き': 'ki',    u'く': 'ku',    u'け': 'ke',    u'こ': 'ko',
@ -38,24 +171,24 @@ _roomaji_kana = {
        u'バ': 'ba',    u'ビ': 'bi',    u'ブ': 'bu',    u'ベ': 'be',    u'ボ': 'bo',
        u'パ': 'pa',    u'ピ': 'pi',    u'プ': 'pu',    u'ペ': 'pe',    u'ポ': 'po',
                                        u'ヴ': 'vu',
-}
+    },

-_roomaji_youon = {
+    roomaji_youon={
        # Hiragana
        u'ゃ': 'ya',                    u'ゅ': 'yu',                    u'ょ': 'yo',

        # Katakana
        u'ャ': 'ya',                    u'ュ': 'yu',                    u'ョ': 'yo',
-}
+    },

    # XXX If romanize() ever handles hiragana, it will need to make sure that the
    # preceding character was a katakana
    # This does not include every small kana combination, but should include every
    # one used in a Pokémon name.  An exhaustive list would be..  very long
-_roomaji_small_kana = {
+    roomaji_small_kana={
        u'ァ': 'a',     u'ィ': 'i',     u'ゥ': 'u',     u'ェ': 'e',     u'ォ': 'o',
-}
-_roomaji_small_kana_combos = {
+    },
+    roomaji_small_kana_combos={
        # These are, by the way, fairly arbitrary.  "shi xi" to mean "sy" is
        # particularly weird, but it seems to be what GF intends

@ -74,100 +207,38 @@ _roomaji_small_kana_combos = {
        u'ミィ': 'my',
        u'ビィ': 'by',
        u'ピィ': 'py',
-}
+    },
+    lengthened_vowels={},
+    y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
+)

-def romanize(string):
-    """Converts a string of kana to roomaji."""
+romanizers['cs'] = Romanizer(parent=romanizers['en'],
+    roomaji_kana={
+        u'し': u'ši', u'ち': u'či', u'つ': u'cu',
+        u'や': u'ja', u'ゆ': u'ju', u'よ': u'jo',
+        u'じ': u'dži', u'ぢ': u'dži',
+        u'シ': u'ši', u'チ': u'či', u'ツ': u'cu',
+        u'ヤ': u'ja', u'ユ': u'ju', u'ヨ': 'jo',
+        u'ジ': u'dži', u'ヂ': u'dži',
+    },
+    roomaji_youon={
+        u'ゃ': 'ja', u'ゅ': 'ju', u'ょ': 'jo',
+        u'ャ': 'ja', u'ュ': 'ju', u'ョ': 'jo',
+    },
+    roomaji_small_kana_combos={
+        u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
+        u'テェ': u'tje', u'デェ': u'dje',
+        u'シィ': u'sí', u'ミィ': u'mí', u'ビィ': u'bí', u'ピィ': u'pí',
+    },
+    lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
+    y_drop={u'či': u'č', u'ši': u'š', u'dži': u'dž', u'ni': u'ňj'},
+)

-    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
+def romanize(string, lang='en'):
+    """Convert a string of kana to roomaji."""

-    characters = []
-    last_kana = None  # Used for ー; っ or ッ; ん or ン
-    last_char = None  # Used for small kana combos
-    for char in string:
-        # Full-width Latin
-        if 0xff01 <= ord(char) <= 0xff5e:
-            if last_kana == 'sokuon':
-                raise ValueError("Sokuon cannot precede Latin characters.")
+    # Get the correct romanizer; fall back to English
+    romanizer = romanizers.get(lang, 'en')

-            # XXX Real Unicode decomposition would be nicer
-            char = chr(ord(char) - 0xff01 + 0x21)
-            characters.append(char)
-
-            last_kana = None
-
-        # Small vowel kana
-        elif char in _roomaji_small_kana:
-            combo = last_char + char
-            if combo in _roomaji_small_kana_combos:
-                characters[-1] = _roomaji_small_kana_combos[combo]
-
-            else:
-                # If we don't know what it is...  act dumb and treat it as a
-                # full-size vowel.  Better than bailing, and seems to occur a
-                # lot, e.g. ピィ is "pii"
-                characters.append(_roomaji_small_kana[char])
-
-            last_kana = _roomaji_small_kana[char]
-
-        # Youon
-        elif char in _roomaji_youon:
-            if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
-                raise ValueError("Youon must follow an -i sound.")
-
-            # Drop the -i and append the ya/yu/yo sound
-            new_sound = _roomaji_youon[char]
-            if last_kana in ['chi', 'shi', 'ji']:
-                # Strip the y-
-                new_char = last_kana[:-1] + new_sound[1:]
-            else:
-                new_char = last_kana[:-1] + new_sound
-
-            characters[-1] = new_char
-            last_kana = new_char
-
-        # Sokuon
-        elif char in (u'っ', u'ッ'):
-            # Remember it and double the consonant next time around
-            last_kana = 'sokuon'
-
-        # Extended vowel or n
-        elif char == u'ー':
-            if last_kana[-1] not in vowels:
-                raise ValueError(u"'ー' must follow by a vowel.")
-            characters.append(last_kana[-1])
-
-            last_kana = None
-
-        # Regular ol' kana
-        elif char in _roomaji_kana:
-            kana = _roomaji_kana[char]
-
-            if last_kana == 'sokuon':
-                if kana[0] in vowels:
-                    raise ValueError("Sokuon cannot precede a vowel.")
-
-                characters.append(kana[0])
-            elif last_kana == 'n' and kana[0] in vowels:
-                characters.append("'")
-
-            characters.append(kana)
-
-            last_kana = kana
-
-        # Not Japanese?
-        else:
-            if last_kana == 'sokuon':
-                raise ValueError("Sokuon must be followed by another kana.")
-
-            characters.append(char)
-
-            last_kana = None
-
-        last_char = char
-
-
-    if last_kana == 'sokuon':
-        raise ValueError("Sokuon cannot be the last character.")
-
-    return unicode(''.join(characters))
+    # Romanize away!
+    return romanizer.romanize(string)
--- a/pokedex/tests/test_roomaji.py
+++ b/pokedex/tests/test_roomaji.py
@ -12,6 +12,7 @@ def test_roomaji():
        # Elongated vowel
        (u'イーブイ',           'iibui'),
        (u'ホーホー',           'hoohoo'),
+        (u'ピカチュウ',         u'pikachuu'),

        # Combined characters
        (u'ニャース',           'nyaasu'),
@ -28,3 +29,28 @@ def test_roomaji():
    for kana, roomaji in tests:
        result = pokedex.roomaji.romanize(kana)
        assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji)
+
+def test_roomaji_cs():
+    tests = [
+        (u'ヤミカラス',         u'jamikarasu'),
+
+        # Elongated vowel
+        (u'イーブイ',           u'íbui'),
+        (u'ホーホー',           u'hóhó'),
+        (u'ピカチュウ',         u'pikačú'),
+
+        # Combined characters
+        (u'ニャース',           u'ňjásu'),
+        (u'ジャ',              u'dža'),
+        (u'ぎゃくてん',         u'gjakuten'),
+        (u'ウェザーボール',     u'wezábóru'),
+
+        # Special katakana combinations
+        (u'ラティアス',         u'ratiasu'),
+        (u'ウィー',             u'wí'),
+        (u'セレビィ',           u'serebí'),
+    ]
+
+    for kana, roomaji in tests:
+        result = pokedex.roomaji.romanize(kana, 'cs')
+        assert_equal(result, roomaji, u"'%s' romanizes correctly for Czech" % roomaji)