Merge remote-tracking branch 'origin/encukou'

This commit is contained in:
Eevee 2011-03-29 08:06:34 -07:00
commit 4445305e7c
4 changed files with 298 additions and 165 deletions

View file

@ -122,6 +122,8 @@ def command_load(*args):
parser = get_parser(verbose=True) parser = get_parser(verbose=True)
parser.add_option('-d', '--directory', dest='directory', default=None) parser.add_option('-d', '--directory', dest='directory', default=None)
parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true') parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
help="Do not use backend-specific optimalizations.")
options, tables = parser.parse_args(list(args)) options, tables = parser.parse_args(list(args))
if not options.engine_uri: if not options.engine_uri:
@ -138,7 +140,7 @@ def command_load(*args):
drop_tables=options.drop_tables, drop_tables=options.drop_tables,
tables=tables, tables=tables,
verbose=options.verbose, verbose=options.verbose,
safe=False) safe=options.safe)
def command_reindex(*args): def command_reindex(*args):
parser = get_parser(verbose=True) parser = get_parser(verbose=True)

View file

@ -140,12 +140,16 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
# Drop all tables if requested # Drop all tables if requested
if drop_tables: if drop_tables:
print_start('Dropping tables') print_start('Dropping tables')
for table in reversed(table_objs): for n, table in enumerate(reversed(table_objs)):
table.drop(checkfirst=True) table.drop(checkfirst=True)
print_status('%s/%s' % (n, len(table_objs)))
print_done() print_done()
for table in table_objs: print_start('Creating tables')
for n, table in enumerate(table_objs):
table.create() table.create()
print_status('%s/%s' % (n, len(table_objs)))
print_done()
connection = session.connection() connection = session.connection()
# Okay, run through the tables and actually load the data now # Okay, run through the tables and actually load the data now
@ -168,6 +172,36 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
reader = csv.reader(csvfile, lineterminator='\n') reader = csv.reader(csvfile, lineterminator='\n')
column_names = [unicode(column) for column in reader.next()] column_names = [unicode(column) for column in reader.next()]
if not safe and session.connection().dialect.name == 'postgresql':
"""
Postgres' CSV dialect is nearly the same as ours, except that it
treats completely empty values as NULL, and empty quoted
strings ("") as an empty strings.
Pokedex dump does not quote empty strings. So, both empty strings
and NULLs are read in as NULL.
For an empty string in a NOT NULL column, the load will fail, and
load will fall back to the cross-backend row-by-row loading. And in
nullable columns, we already load empty stings as NULL.
"""
session.commit()
not_null_cols = [c for c in column_names if not table_obj.c[c].nullable]
if not_null_cols:
force_not_null = 'FORCE NOT NULL ' + ','.join('"%s"' % c for c in not_null_cols)
else:
force_not_null = ''
command = "COPY {table_name} ({columns}) FROM '{csvpath}' CSV HEADER {force_not_null}"
session.connection().execute(
command.format(
table_name=table_name,
csvpath=csvpath,
columns=','.join('"%s"' % c for c in column_names),
force_not_null=force_not_null,
)
)
session.commit()
print_done()
continue
# Self-referential tables may contain rows with foreign keys of other # Self-referential tables may contain rows with foreign keys of other
# rows in the same table that do not yet exist. Pull these out and add # rows in the same table that do not yet exist. Pull these out and add
# them to the session last # them to the session last

View file

@ -1,7 +1,140 @@
# encoding: utf8 # encoding: utf8
"""Provides `romanize()` for romanizing simple Japanese text.""" """Provides `romanize()` for romanizing simple Japanese text.
_roomaji_kana = { Also provides available romanizers in a dictionary keyed by language identifier.
"""
class Romanizer(object):
    """Table-driven converter from kana to Latin script.

    All language-specific behaviour lives in lookup tables (plain dicts)
    that are passed to the constructor and exposed as attributes.
    `romanize()` reads these attributes: `roomaji_kana`, `roomaji_youon`,
    `roomaji_small_kana`, `roomaji_small_kana_combos`, `lengthened_vowels`
    and `y_drop`.
    """

    def __init__(self, parent=None, **tables):
        """Create a Romanizer

        parent: A Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is
            given, its tables are used, and updated with the given ones
        """
        self.parent = parent

        if parent:
            # Copy the name -> table mapping itself; sharing the parent's
            # mapping would make the assignments below replace the parent's
            # own entries.
            self.tables = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables.
                # A table the parent does not have simply starts out empty.
                merged = dict(self.tables.get(name, {}))
                merged.update(table)
                self.tables[name] = merged
        else:
            self.tables = tables

        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        string: the text to convert; characters found in none of the
            tables are passed through unchanged

        Returns the romanized text as a unicode string.

        Raises ValueError when a sokuon, youon or long-vowel mark appears
        where it cannot be interpreted.
        """
        # Written as escapes so the literals survive re-encoding.
        sokuon_marks = (u'\u3063', u'\u30c3')   # っ, ッ
        long_vowel_mark = u'\u30fc'             # ー

        vowels = ['a', 'e', 'i', 'o', 'u', 'y']

        characters = []
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos
        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # A small kana at the very start has no predecessor to
                # combine with; it then falls through to the plain vowel.
                combo = (last_char or u'') + char
                if characters and combo in self.roomaji_small_kana_combos:
                    characters[-1] = self.roomaji_small_kana_combos[combo]

                else:
                    # If we don't know what it is...  act dumb and treat it
                    # as a full-size vowel.  Better than bailing, and seems
                    # to occur a lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the y-
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in sokuon_marks:
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Extended vowel or n
            elif char == long_vowel_mark:
                # `not last_kana` covers a mark at the start of the string
                # or right after a non-kana character.
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'\u30fc' must follow by a vowel.")
                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the trailing vowel for its lengthened form
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular ol' kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        raise ValueError("Sokuon cannot precede a vowel.")

                    characters.append(kana[0])

                elif last_kana == 'n' and kana[0] in vowels:
                    characters.append("'")

                # Special characters for doubled kana.  The [-1:] slice
                # keeps an empty preceding fragment from raising IndexError.
                if (kana[0] in self.lengthened_vowels and characters
                        and kana == characters[-1][-1:]):
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        return u''.join(characters)
romanizers = dict()
romanizers['en'] = Romanizer(
roomaji_kana={
# Hiragana # Hiragana
u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o', u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o',
u'': 'ka', u'': 'ki', u'': 'ku', u'': 'ke', u'': 'ko', u'': 'ka', u'': 'ki', u'': 'ku', u'': 'ke', u'': 'ko',
@ -38,24 +171,24 @@ _roomaji_kana = {
u'': 'ba', u'': 'bi', u'': 'bu', u'': 'be', u'': 'bo', u'': 'ba', u'': 'bi', u'': 'bu', u'': 'be', u'': 'bo',
u'': 'pa', u'': 'pi', u'': 'pu', u'': 'pe', u'': 'po', u'': 'pa', u'': 'pi', u'': 'pu', u'': 'pe', u'': 'po',
u'': 'vu', u'': 'vu',
} },
_roomaji_youon = { roomaji_youon={
# Hiragana # Hiragana
u'': 'ya', u'': 'yu', u'': 'yo', u'': 'ya', u'': 'yu', u'': 'yo',
# Katakana # Katakana
u'': 'ya', u'': 'yu', u'': 'yo', u'': 'ya', u'': 'yu', u'': 'yo',
} },
# XXX If romanize() ever handles hiragana, it will need to make sure that the # XXX If romanize() ever handles hiragana, it will need to make sure that the
# preceding character was a katakana # preceding character was a katakana
# This does not include every small kana combination, but should include every # This does not include every small kana combination, but should include every
# one used in a Pokémon name. An exhaustive list would be.. very long # one used in a Pokémon name. An exhaustive list would be.. very long
_roomaji_small_kana = { roomaji_small_kana={
u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o', u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o',
} },
_roomaji_small_kana_combos = { roomaji_small_kana_combos={
# These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is # These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
# particularly weird, but it seems to be what GF intends # particularly weird, but it seems to be what GF intends
@ -74,100 +207,38 @@ _roomaji_small_kana_combos = {
u'ミィ': 'my', u'ミィ': 'my',
u'ビィ': 'by', u'ビィ': 'by',
u'ピィ': 'py', u'ピィ': 'py',
} },
lengthened_vowels={},
y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
)
def romanize(string): romanizers['cs'] = Romanizer(parent=romanizers['en'],
"""Converts a string of kana to roomaji.""" roomaji_kana={
u'': u'ši', u'': u'či', u'': u'cu',
u'': u'ja', u'': u'ju', u'': u'jo',
u'': u'dži', u'': u'dži',
u'': u'ši', u'': u'či', u'': u'cu',
u'': u'ja', u'': u'ju', u'': 'jo',
u'': u'dži', u'': u'dži',
},
roomaji_youon={
u'': 'ja', u'': 'ju', u'': 'jo',
u'': 'ja', u'': 'ju', u'': 'jo',
},
roomaji_small_kana_combos={
u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
u'テェ': u'tje', u'デェ': u'dje',
u'シィ': u'', u'ミィ': u'', u'ビィ': u'', u'ピィ': u'',
},
lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
y_drop={u'či': u'č', u'ši': u'š', u'dži': u'', u'ni': u'ňj'},
)
def romanize(string, lang='en'):
    """Convert a string of kana to roomaji.

    string: the kana text to convert
    lang: key into the module-level `romanizers` dict; an unknown
        identifier falls back to the English romanizer

    Returns whatever the selected romanizer's `romanize()` returns.
    """
    # Get the correct romanizer; fall back to English.  The default handed
    # to dict.get() must be the romanizer object itself -- passing the key
    # 'en' would return a plain string that has no romanize() method.
    romanizer = romanizers.get(lang, romanizers['en'])

    # Romanize away!
    return romanizer.romanize(string)
characters.append(char)
last_kana = None
# Small vowel kana
elif char in _roomaji_small_kana:
combo = last_char + char
if combo in _roomaji_small_kana_combos:
characters[-1] = _roomaji_small_kana_combos[combo]
else:
# If we don't know what it is... act dumb and treat it as a
# full-size vowel. Better than bailing, and seems to occur a
# lot, e.g. ピィ is "pii"
characters.append(_roomaji_small_kana[char])
last_kana = _roomaji_small_kana[char]
# Youon
elif char in _roomaji_youon:
if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
raise ValueError("Youon must follow an -i sound.")
# Drop the -i and append the ya/yu/yo sound
new_sound = _roomaji_youon[char]
if last_kana in ['chi', 'shi', 'ji']:
# Strip the y-
new_char = last_kana[:-1] + new_sound[1:]
else:
new_char = last_kana[:-1] + new_sound
characters[-1] = new_char
last_kana = new_char
# Sokuon
elif char in (u'', u''):
# Remember it and double the consonant next time around
last_kana = 'sokuon'
# Extended vowel or n
elif char == u'':
if last_kana[-1] not in vowels:
raise ValueError(u"'' must follow by a vowel.")
characters.append(last_kana[-1])
last_kana = None
# Regular ol' kana
elif char in _roomaji_kana:
kana = _roomaji_kana[char]
if last_kana == 'sokuon':
if kana[0] in vowels:
raise ValueError("Sokuon cannot precede a vowel.")
characters.append(kana[0])
elif last_kana == 'n' and kana[0] in vowels:
characters.append("'")
characters.append(kana)
last_kana = kana
# Not Japanese?
else:
if last_kana == 'sokuon':
raise ValueError("Sokuon must be followed by another kana.")
characters.append(char)
last_kana = None
last_char = char
if last_kana == 'sokuon':
raise ValueError("Sokuon cannot be the last character.")
return unicode(''.join(characters))

View file

@ -12,6 +12,7 @@ def test_roomaji():
# Elongated vowel # Elongated vowel
(u'イーブイ', 'iibui'), (u'イーブイ', 'iibui'),
(u'ホーホー', 'hoohoo'), (u'ホーホー', 'hoohoo'),
(u'ピカチュウ', u'pikachuu'),
# Combined characters # Combined characters
(u'ニャース', 'nyaasu'), (u'ニャース', 'nyaasu'),
@ -28,3 +29,28 @@ def test_roomaji():
for kana, roomaji in tests: for kana, roomaji in tests:
result = pokedex.roomaji.romanize(kana) result = pokedex.roomaji.romanize(kana)
assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji) assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji)
def test_roomaji_cs():
    """Check Czech romanization of a handful of kana names."""
    cases = (
        (u'ヤミカラス', u'jamikarasu'),

        # Elongated vowel
        (u'イーブイ', u'íbui'),
        (u'ホーホー', u'hóhó'),
        (u'ピカチュウ', u'pikačú'),

        # Combined characters
        (u'ニャース', u'ňjásu'),
        (u'ジャ', u'dža'),
        (u'ぎゃくてん', u'gjakuten'),
        (u'ウェザーボール', u'wezábóru'),

        # Special katakana combinations
        (u'ラティアス', u'ratiasu'),
        # NOTE(review): the expected value below is empty -- confirm it was
        # not lost in transcription.
        (u'ウィー', u''),
        (u'セレビィ', u'serebí'),
    )

    for source, expected in cases:
        actual = pokedex.roomaji.romanize(source, 'cs')
        message = u"'%s' romanizes correctly for Czech" % expected
        assert_equal(actual, expected, message)