Add Czech romanization

This commit is contained in:
Petr Viktorin 2011-01-26 02:14:42 +01:00
parent 3475c86d2e
commit c25db1d2cf
2 changed files with 259 additions and 162 deletions

View file

@ -1,7 +1,140 @@
# encoding: utf8
"""Provides `romanize()` for romanizing simple Japanese text."""
"""Provides `romanize()` for romanizing simple Japanese text.
_roomaji_kana = {
Also provides available romanizers in a dictionary keyed by language identifier.
"""
class Romanizer(object):
    """Kana-to-Latin converter driven by a set of named lookup tables.

    `romanize()` reads the following tables as attributes of the instance:
    roomaji_kana, roomaji_youon, roomaji_small_kana,
    roomaji_small_kana_combos, lengthened_vowels and y_drop.
    """

    def __init__(self, parent=None, **tables):
        """Create a Romanizer

        parent: A Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is
            given, its tables are used, and updated with the given ones
        """
        self.parent = parent
        if parent is not None:
            # Copy the outer mapping first: assigning into the parent's own
            # `tables` dict would replace the parent's entries with the
            # merged ones built below.
            self.tables = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables.
                # A name the parent does not define starts from empty.
                merged = dict(self.tables.get(name, {}))
                merged.update(table)
                self.tables[name] = merged
        else:
            self.tables = tables
        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        string: the text to convert; characters found in none of the tables
            are passed through unchanged

        Returns the romanized text as a single unicode string.

        Raises ValueError when a sokuon is not followed by a consonant kana,
        when a youon does not follow an -i sound, or when the long-vowel
        mark does not follow a vowel.
        """
        vowels = ['a', 'e', 'i', 'o', 'u', 'y']

        characters = []
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos

        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # There is no preceding character to combine with when the
                # small kana opens the string
                combo = None if last_char is None else last_char + char
                if combo in self.roomaji_small_kana_combos:
                    characters[-1] = self.roomaji_small_kana_combos[combo]

                else:
                    # If we don't know what it is... act dumb and treat it as a
                    # full-size vowel. Better than bailing, and seems to occur a
                    # lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the y-
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in (u'っ', u'ッ'):
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Extended vowel or n
            elif char == u'ー':
                # `not last_kana` covers a mark with nothing usable before it
                # (start of string, or right after a non-kana character)
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'ー' must follow by a vowel.")
                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the plain vowel for its lengthened spelling
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular ol' kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        raise ValueError("Sokuon cannot precede a vowel.")

                    characters.append(kana[0])

                elif last_kana == 'n' and kana[0] in vowels:
                    characters.append("'")

                # Special characters for doubled kana.  The [-1:] slice keeps
                # the comparison safe when the previous piece is empty.
                if (kana[0] in self.lengthened_vowels and characters
                        and kana == characters[-1][-1:]):
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        # Joining on a unicode separator yields a unicode result directly
        return u''.join(characters)
romanizers = dict()
romanizers['en'] = Romanizer(
roomaji_kana={
# Hiragana
u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o',
u'': 'ka', u'': 'ki', u'': 'ku', u'': 'ke', u'': 'ko',
@ -38,24 +171,24 @@ _roomaji_kana = {
u'': 'ba', u'': 'bi', u'': 'bu', u'': 'be', u'': 'bo',
u'': 'pa', u'': 'pi', u'': 'pu', u'': 'pe', u'': 'po',
u'': 'vu',
}
},
_roomaji_youon = {
roomaji_youon={
# Hiragana
u'': 'ya', u'': 'yu', u'': 'yo',
# Katakana
u'': 'ya', u'': 'yu', u'': 'yo',
}
},
# XXX If romanize() ever handles hiragana, it will need to make sure that the
# preceding character was a katakana
# This does not include every small kana combination, but should include every
# one used in a Pokémon name. An exhaustive list would be.. very long
_roomaji_small_kana = {
# XXX If romanize() ever handles hiragana, it will need to make sure that the
# preceding character was a katakana
# This does not include every small kana combination, but should include every
# one used in a Pokémon name. An exhaustive list would be.. very long
roomaji_small_kana={
u'': 'a', u'': 'i', u'': 'u', u'': 'e', u'': 'o',
}
_roomaji_small_kana_combos = {
},
roomaji_small_kana_combos={
# These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
# particularly weird, but it seems to be what GF intends
@ -74,100 +207,38 @@ _roomaji_small_kana_combos = {
u'ミィ': 'my',
u'ビィ': 'by',
u'ピィ': 'py',
}
},
lengthened_vowels={},
y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
)
def romanize(string):
"""Converts a string of kana to roomaji."""
# Czech romanizer.  It is built on top of the English one: every table named
# below starts as a copy of the English table of the same name and is then
# updated with the entries given here.
# NOTE(review): many keys (and a few values) in this statement are empty
# strings as shown here, which collapses each dict to far fewer entries than
# intended.  Presumably the kana/Latin literals were lost somewhere between
# the repository and this view -- confirm against the repository copy before
# relying on these tables.
romanizers['cs'] = Romanizer(parent=romanizers['en'],
# Per-kana overrides with Czech spellings (š, č, c, j, dž)
roomaji_kana={
u'': u'ši', u'': u'či', u'': u'cu',
u'': u'ja', u'': u'ju', u'': u'jo',
u'': u'dži', u'': u'dži',
u'': u'ši', u'': u'či', u'': u'cu',
u'': u'ja', u'': u'ju', u'': 'jo',
u'': u'dži', u'': u'dži',
},
# Youon sounds are spelled with j- instead of y-
roomaji_youon={
u'': 'ja', u'': 'ju', u'': 'jo',
u'': 'ja', u'': 'ju', u'': 'jo',
},
# Two-character katakana combinations replaced as a unit
roomaji_small_kana_combos={
u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
u'テェ': u'tje', u'デェ': u'dje',
# NOTE(review): the four values on the next line are empty -- presumably lost
u'シィ': u'', u'ミィ': u'', u'ビィ': u'', u'ピィ': u'',
},
# Plain vowel -> accented letter, used by Romanizer.romanize() for the
# long-vowel mark and for a vowel kana that repeats the preceding vowel
lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
# -i syllable -> prefix used before a youon sound; the youon's first letter
# is dropped when the preceding syllable is listed here
# NOTE(review): the value for u'dži' is empty -- presumably lost
y_drop={u'či': u'č', u'ši': u'š', u'dži': u'', u'ni': u'ňj'},
)
vowels = ['a', 'e', 'i', 'o', 'u', 'y']
def romanize(string, lang='en'):
    """Convert a string of kana to roomaji.

    string: the kana text to convert
    lang: language identifier used as a key into `romanizers`; an identifier
        with no registered romanizer falls back to English

    Returns the result of the selected romanizer's `romanize()` method, and
    lets any ValueError it raises propagate.
    """
    # Get the correct romanizer; fall back to English.  The fallback has to be
    # the romanizer object itself -- a bare 'en' string has no romanize().
    romanizer = romanizers.get(lang, romanizers['en'])

    # Romanize away!
    return romanizer.romanize(string)

View file

@ -12,6 +12,7 @@ def test_roomaji():
# Elongated vowel
(u'イーブイ', 'iibui'),
(u'ホーホー', 'hoohoo'),
(u'ピカチュウ', u'pikachuu'),
# Combined characters
(u'ニャース', 'nyaasu'),
@ -28,3 +29,28 @@ def test_roomaji():
for kana, roomaji in tests:
result = pokedex.roomaji.romanize(kana)
assert_equal(result, roomaji, u"'%s' romanizes correctly" % roomaji)
def test_roomaji_cs():
    """Check that sample kana names romanize to the expected Czech spelling."""
    cases = (
        (u'ヤミカラス', u'jamikarasu'),

        # Elongated vowel
        (u'イーブイ', u'íbui'),
        (u'ホーホー', u'hóhó'),
        (u'ピカチュウ', u'pikačú'),

        # Combined characters
        (u'ニャース', u'ňjásu'),
        (u'ジャ', u'dža'),
        (u'ぎゃくてん', u'gjakuten'),
        (u'ウェザーボール', u'wezábóru'),

        # Special katakana combinations
        (u'ラティアス', u'ratiasu'),
        # NOTE(review): the expected value below is empty as shown here;
        # presumably the literal was lost -- confirm against the repository
        (u'ウィー', u''),
        (u'セレビィ', u'serebí'),
    )

    for kana, expected in cases:
        actual = pokedex.roomaji.romanize(kana, 'cs')
        message = u"'%s' romanizes correctly for Czech" % expected
        assert_equal(actual, expected, message)