mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Merge remote-tracking branch 'origin/encukou'
This commit is contained in:
commit
4445305e7c
4 changed files with 298 additions and 165 deletions
|
@ -122,6 +122,8 @@ def command_load(*args):
|
||||||
parser = get_parser(verbose=True)
|
parser = get_parser(verbose=True)
|
||||||
parser.add_option('-d', '--directory', dest='directory', default=None)
|
parser.add_option('-d', '--directory', dest='directory', default=None)
|
||||||
parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
|
parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
|
||||||
|
parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
|
||||||
|
help="Do not use backend-specific optimalizations.")
|
||||||
options, tables = parser.parse_args(list(args))
|
options, tables = parser.parse_args(list(args))
|
||||||
|
|
||||||
if not options.engine_uri:
|
if not options.engine_uri:
|
||||||
|
@ -138,7 +140,7 @@ def command_load(*args):
|
||||||
drop_tables=options.drop_tables,
|
drop_tables=options.drop_tables,
|
||||||
tables=tables,
|
tables=tables,
|
||||||
verbose=options.verbose,
|
verbose=options.verbose,
|
||||||
safe=False)
|
safe=options.safe)
|
||||||
|
|
||||||
def command_reindex(*args):
|
def command_reindex(*args):
|
||||||
parser = get_parser(verbose=True)
|
parser = get_parser(verbose=True)
|
||||||
|
|
|
@ -140,12 +140,16 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
|
||||||
# Drop all tables if requested
|
# Drop all tables if requested
|
||||||
if drop_tables:
|
if drop_tables:
|
||||||
print_start('Dropping tables')
|
print_start('Dropping tables')
|
||||||
for table in reversed(table_objs):
|
for n, table in enumerate(reversed(table_objs)):
|
||||||
table.drop(checkfirst=True)
|
table.drop(checkfirst=True)
|
||||||
|
print_status('%s/%s' % (n, len(table_objs)))
|
||||||
print_done()
|
print_done()
|
||||||
|
|
||||||
for table in table_objs:
|
print_start('Creating tables')
|
||||||
|
for n, table in enumerate(table_objs):
|
||||||
table.create()
|
table.create()
|
||||||
|
print_status('%s/%s' % (n, len(table_objs)))
|
||||||
|
print_done()
|
||||||
connection = session.connection()
|
connection = session.connection()
|
||||||
|
|
||||||
# Okay, run through the tables and actually load the data now
|
# Okay, run through the tables and actually load the data now
|
||||||
|
@ -168,6 +172,36 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
|
||||||
reader = csv.reader(csvfile, lineterminator='\n')
|
reader = csv.reader(csvfile, lineterminator='\n')
|
||||||
column_names = [unicode(column) for column in reader.next()]
|
column_names = [unicode(column) for column in reader.next()]
|
||||||
|
|
||||||
|
if not safe and session.connection().dialect.name == 'postgresql':
|
||||||
|
"""
|
||||||
|
Postgres' CSV dialect is nearly the same as ours, except that it
|
||||||
|
treats completely empty values as NULL, and empty quoted
|
||||||
|
strings ("") as empty strings.
|
||||||
|
Pokedex dump does not quote empty strings. So, both empty strings
|
||||||
|
and NULLs are read in as NULL.
|
||||||
|
For an empty string in a NOT NULL column, the load will fail, and
|
||||||
|
load will fall back to the cross-backend row-by-row loading. And in
|
||||||
|
nullable columns, we already load empty strings as NULL.
|
||||||
|
"""
|
||||||
|
session.commit()
|
||||||
|
not_null_cols = [c for c in column_names if not table_obj.c[c].nullable]
|
||||||
|
if not_null_cols:
|
||||||
|
force_not_null = 'FORCE NOT NULL ' + ','.join('"%s"' % c for c in not_null_cols)
|
||||||
|
else:
|
||||||
|
force_not_null = ''
|
||||||
|
command = "COPY {table_name} ({columns}) FROM '{csvpath}' CSV HEADER {force_not_null}"
|
||||||
|
session.connection().execute(
|
||||||
|
command.format(
|
||||||
|
table_name=table_name,
|
||||||
|
csvpath=csvpath,
|
||||||
|
columns=','.join('"%s"' % c for c in column_names),
|
||||||
|
force_not_null=force_not_null,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
session.commit()
|
||||||
|
print_done()
|
||||||
|
continue
|
||||||
|
|
||||||
# Self-referential tables may contain rows with foreign keys of other
|
# Self-referential tables may contain rows with foreign keys of other
|
||||||
# rows in the same table that do not yet exist. Pull these out and add
|
# rows in the same table that do not yet exist. Pull these out and add
|
||||||
# them to the session last
|
# them to the session last
|
||||||
|
|
|
@ -1,7 +1,140 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
"""Provides `romanize()` for romanizing simple Japanese text."""
|
"""Provides `romanize()` for romanizing simple Japanese text.
|
||||||
|
|
||||||
_roomaji_kana = {
|
Also provides available romanizers in a dictionary keyed by language identifier.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class Romanizer(object):
    """Table-driven converter from kana to roomaji.

    The lookup tables given to the constructor are kept in `self.tables` and
    are also exposed as attributes of the same name.  `romanize()` reads the
    following ones: `roomaji_kana`, `roomaji_youon`, `roomaji_small_kana`,
    `roomaji_small_kana_combos`, `lengthened_vowels` and `y_drop`.
    """

    def __init__(self, parent=None, **tables):
        """Create a Romanizer

        parent: A Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is given,
            its tables are used, and updated with the given ones
        """
        self.parent = parent
        if parent:
            # Copy the outer mapping as well: the entries are rebound below,
            # and that must not leak into parent.tables
            self.tables = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables.
                # A table the parent does not have simply starts out empty.
                merged = dict(self.tables.get(name, {}))
                merged.update(table)
                self.tables[name] = merged
        else:
            self.tables = tables

        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        Characters that are not found in any table are passed through
        unchanged.  Raises ValueError for sequences that cannot be romanized:
        a misplaced sokuon, a youon that does not follow an -i sound, or a
        'ー' that does not follow a vowel.
        """

        vowels = ['a', 'e', 'i', 'o', 'u', 'y']

        characters = []
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos
        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # There is no preceding character to combine with when the
                # small kana opens the string
                combo = None if last_char is None else last_char + char
                if combo in self.roomaji_small_kana_combos:
                    characters[-1] = self.roomaji_small_kana_combos[combo]

                else:
                    # If we don't know what it is...  act dumb and treat it as a
                    # full-size vowel.  Better than bailing, and seems to occur a
                    # lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the y-
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in (u'っ', u'ッ'):
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Extended vowel or n
            elif char == u'ー':
                # last_kana is None at the start of the string and after any
                # non-kana character; there is nothing to lengthen then
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'ー' must follow by a vowel.")
                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the plain vowel for its lengthened form
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular ol' kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        raise ValueError("Sokuon cannot precede a vowel.")

                    characters.append(kana[0])
                elif last_kana == 'n' and kana[0] in vowels:
                    characters.append("'")

                # Special characters for doubled kana
                if kana[0] in self.lengthened_vowels and characters and kana == characters[-1][-1]:
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        # Joining on a unicode separator always yields a unicode string
        return u''.join(characters)
|
||||||
|
|
||||||
|
|
||||||
|
romanizers = dict()
|
||||||
|
|
||||||
|
romanizers['en'] = Romanizer(
|
||||||
|
roomaji_kana={
|
||||||
# Hiragana
|
# Hiragana
|
||||||
u'あ': 'a', u'い': 'i', u'う': 'u', u'え': 'e', u'お': 'o',
|
u'あ': 'a', u'い': 'i', u'う': 'u', u'え': 'e', u'お': 'o',
|
||||||
u'か': 'ka', u'き': 'ki', u'く': 'ku', u'け': 'ke', u'こ': 'ko',
|
u'か': 'ka', u'き': 'ki', u'く': 'ku', u'け': 'ke', u'こ': 'ko',
|
||||||
|
@ -38,24 +171,24 @@ _roomaji_kana = {
|
||||||
u'バ': 'ba', u'ビ': 'bi', u'ブ': 'bu', u'ベ': 'be', u'ボ': 'bo',
|
u'バ': 'ba', u'ビ': 'bi', u'ブ': 'bu', u'ベ': 'be', u'ボ': 'bo',
|
||||||
u'パ': 'pa', u'ピ': 'pi', u'プ': 'pu', u'ペ': 'pe', u'ポ': 'po',
|
u'パ': 'pa', u'ピ': 'pi', u'プ': 'pu', u'ペ': 'pe', u'ポ': 'po',
|
||||||
u'ヴ': 'vu',
|
u'ヴ': 'vu',
|
||||||
}
|
},
|
||||||
|
|
||||||
_roomaji_youon = {
|
roomaji_youon={
|
||||||
# Hiragana
|
# Hiragana
|
||||||
u'ゃ': 'ya', u'ゅ': 'yu', u'ょ': 'yo',
|
u'ゃ': 'ya', u'ゅ': 'yu', u'ょ': 'yo',
|
||||||
|
|
||||||
# Katakana
|
# Katakana
|
||||||
u'ャ': 'ya', u'ュ': 'yu', u'ョ': 'yo',
|
u'ャ': 'ya', u'ュ': 'yu', u'ョ': 'yo',
|
||||||
}
|
},
|
||||||
|
|
||||||
# XXX If romanize() ever handles hiragana, it will need to make sure that the
|
# XXX If romanize() ever handles hiragana, it will need to make sure that the
|
||||||
# preceding character was a katakana
|
# preceding character was a katakana
|
||||||
# This does not include every small kana combination, but should include every
|
# This does not include every small kana combination, but should include every
|
||||||
# one used in a Pokémon name. An exhaustive list would be.. very long
|
# one used in a Pokémon name. An exhaustive list would be.. very long
|
||||||
_roomaji_small_kana = {
|
roomaji_small_kana={
|
||||||
u'ァ': 'a', u'ィ': 'i', u'ゥ': 'u', u'ェ': 'e', u'ォ': 'o',
|
u'ァ': 'a', u'ィ': 'i', u'ゥ': 'u', u'ェ': 'e', u'ォ': 'o',
|
||||||
}
|
},
|
||||||
_roomaji_small_kana_combos = {
|
roomaji_small_kana_combos={
|
||||||
# These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
|
# These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
|
||||||
# particularly weird, but it seems to be what GF intends
|
# particularly weird, but it seems to be what GF intends
|
||||||
|
|
||||||
|
@ -74,100 +207,38 @@ _roomaji_small_kana_combos = {
|
||||||
u'ミィ': 'my',
|
u'ミィ': 'my',
|
||||||
u'ビィ': 'by',
|
u'ビィ': 'by',
|
||||||
u'ピィ': 'py',
|
u'ピィ': 'py',
|
||||||
}
|
},
|
||||||
|
lengthened_vowels={},
|
||||||
|
y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
|
||||||
|
)
|
||||||
|
|
||||||
def romanize(string):
|
romanizers['cs'] = Romanizer(parent=romanizers['en'],
|
||||||
"""Converts a string of kana to roomaji."""
|
roomaji_kana={
|
||||||
|
u'し': u'ši', u'ち': u'či', u'つ': u'cu',
|
||||||
|
u'や': u'ja', u'ゆ': u'ju', u'よ': u'jo',
|
||||||
|
u'じ': u'dži', u'ぢ': u'dži',
|
||||||
|
u'シ': u'ši', u'チ': u'či', u'ツ': u'cu',
|
||||||
|
u'ヤ': u'ja', u'ユ': u'ju', u'ヨ': 'jo',
|
||||||
|
u'ジ': u'dži', u'ヂ': u'dži',
|
||||||
|
},
|
||||||
|
roomaji_youon={
|
||||||
|
u'ゃ': 'ja', u'ゅ': 'ju', u'ょ': 'jo',
|
||||||
|
u'ャ': 'ja', u'ュ': 'ju', u'ョ': 'jo',
|
||||||
|
},
|
||||||
|
roomaji_small_kana_combos={
|
||||||
|
u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
|
||||||
|
u'テェ': u'tje', u'デェ': u'dje',
|
||||||
|
u'シィ': u'sí', u'ミィ': u'mí', u'ビィ': u'bí', u'ピィ': u'pí',
|
||||||
|
},
|
||||||
|
lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
|
||||||
|
y_drop={u'či': u'č', u'ši': u'š', u'dži': u'dž', u'ni': u'ňj'},
|
||||||
|
)
|
||||||
|
|
||||||
vowels = ['a', 'e', 'i', 'o', 'u', 'y']
|
def romanize(string, lang='en'):
|
||||||
|
"""Convert a string of kana to roomaji."""
|
||||||
|
|
||||||
characters = []
|
# Get the correct romanizer; fall back to English
|
||||||
last_kana = None # Used for ー; っ or ッ; ん or ン
|
# The fallback must be the English Romanizer object itself, not its key 'en'
romanizer = romanizers.get(lang, romanizers['en'])
|
||||||
last_char = None # Used for small kana combos
|
|
||||||
for char in string:
|
|
||||||
# Full-width Latin
|
|
||||||
if 0xff01 <= ord(char) <= 0xff5e:
|
|
||||||
if last_kana == 'sokuon':
|
|
||||||
raise ValueError("Sokuon cannot precede Latin characters.")
|
|
||||||
|
|
||||||
# XXX Real Unicode decomposition would be nicer
|
# Romanize away!
|
||||||
char = chr(ord(char) - 0xff01 + 0x21)
|
return romanizer.romanize(string)
|
||||||
characters.append(char)
|
|
||||||
|
|
||||||
last_kana = None
|
|
||||||
|
|
||||||
# Small vowel kana
|
|
||||||
elif char in _roomaji_small_kana:
|
|
||||||
combo = last_char + char
|
|
||||||
if combo in _roomaji_small_kana_combos:
|
|
||||||
characters[-1] = _roomaji_small_kana_combos[combo]
|
|
||||||
|
|
||||||
else:
|
|
||||||
# If we don't know what it is... act dumb and treat it as a
|
|
||||||
# full-size vowel. Better than bailing, and seems to occur a
|
|
||||||
# lot, e.g. ピィ is "pii"
|
|
||||||
characters.append(_roomaji_small_kana[char])
|
|
||||||
|
|
||||||
last_kana = _roomaji_small_kana[char]
|
|
||||||
|
|
||||||
# Youon
|
|
||||||
elif char in _roomaji_youon:
|
|
||||||
if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
|
|
||||||
raise ValueError("Youon must follow an -i sound.")
|
|
||||||
|
|
||||||
# Drop the -i and append the ya/yu/yo sound
|
|
||||||
new_sound = _roomaji_youon[char]
|
|
||||||
if last_kana in ['chi', 'shi', 'ji']:
|
|
||||||
# Strip the y-
|
|
||||||
new_char = last_kana[:-1] + new_sound[1:]
|
|
||||||
else:
|
|
||||||
new_char = last_kana[:-1] + new_sound
|
|
||||||
|
|
||||||
characters[-1] = new_char
|
|
||||||
last_kana = new_char
|
|
||||||
|
|
||||||
# Sokuon
|
|
||||||
elif char in (u'っ', u'ッ'):
|
|
||||||
# Remember it and double the consonant next time around
|
|
||||||
last_kana = 'sokuon'
|
|
||||||
|
|
||||||
# Extended vowel or n
|
|
||||||
elif char == u'ー':
|
|
||||||
if last_kana[-1] not in vowels:
|
|
||||||
raise ValueError(u"'ー' must follow by a vowel.")
|
|
||||||
characters.append(last_kana[-1])
|
|
||||||
|
|
||||||
last_kana = None
|
|
||||||
|
|
||||||
# Regular ol' kana
|
|
||||||
elif char in _roomaji_kana:
|
|
||||||
kana = _roomaji_kana[char]
|
|
||||||
|
|
||||||
if last_kana == 'sokuon':
|
|
||||||
if kana[0] in vowels:
|
|
||||||
raise ValueError("Sokuon cannot precede a vowel.")
|
|
||||||
|
|
||||||
characters.append(kana[0])
|
|
||||||
elif last_kana == 'n' and kana[0] in vowels:
|
|
||||||
characters.append("'")
|
|
||||||
|
|
||||||
characters.append(kana)
|
|
||||||
|
|
||||||
last_kana = kana
|
|
||||||
|
|
||||||
# Not Japanese?
|
|
||||||
else:
|
|
||||||
if last_kana == 'sokuon':
|
|
||||||
raise ValueError("Sokuon must be followed by another kana.")
|
|
||||||
|
|
||||||
characters.append(char)
|
|
||||||
|
|
||||||
last_kana = None
|
|
||||||
|
|
||||||
last_char = char
|
|
||||||
|
|
||||||
|
|
||||||
if last_kana == 'sokuon':
|
|
||||||
raise ValueError("Sokuon cannot be the last character.")
|
|
||||||
|
|
||||||
return unicode(''.join(characters))
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ def test_roomaji():
|
||||||
# Elongated vowel
|
# Elongated vowel
|
||||||
(u'イーブイ', 'iibui'),
|
(u'イーブイ', 'iibui'),
|
||||||
(u'ホーホー', 'hoohoo'),
|
(u'ホーホー', 'hoohoo'),
|
||||||
|
(u'ピカチュウ', u'pikachuu'),
|
||||||
|
|
||||||
# Combined characters
|
# Combined characters
|
||||||
(u'ニャース', 'nyaasu'),
|
(u'ニャース', 'nyaasu'),
|
||||||
|
@ -28,3 +29,28 @@ def test_roomaji():
|
||||||
for kana, roomaji in tests:
|
for kana, roomaji in tests:
|
||||||
result = pokedex.roomaji.romanize(kana)
|
result = pokedex.roomaji.romanize(kana)
|
||||||
assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji)
|
assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji)
|
||||||
|
|
||||||
|
def test_roomaji_cs():
    """Check the Czech ('cs') romanizer against known kana/roomaji pairs."""
    expectations = (
        (u'ヤミカラス', u'jamikarasu'),

        # Elongated vowel
        (u'イーブイ', u'íbui'),
        (u'ホーホー', u'hóhó'),
        (u'ピカチュウ', u'pikačú'),

        # Combined characters
        (u'ニャース', u'ňjásu'),
        (u'ジャ', u'dža'),
        (u'ぎゃくてん', u'gjakuten'),
        (u'ウェザーボール', u'wezábóru'),

        # Special katakana combinations
        (u'ラティアス', u'ratiasu'),
        (u'ウィー', u'wí'),
        (u'セレビィ', u'serebí'),
    )

    for source, expected in expectations:
        actual = pokedex.roomaji.romanize(source, 'cs')
        message = u"'%s' romanizes correctly for Czech" % expected
        assert_equal(actual, expected, message)
|
||||||
|
|
Loading…
Reference in a new issue