veekun_pokedex/pokedex/roomaji.py

174 lines
7.3 KiB
Python
Raw Normal View History

# encoding: utf8
"""Provides `romanize()` for romanizing simple Japanese text."""
# Maps a single full-size kana to its romanization.  Hiragana and katakana
# share the same readings; the only katakana-only entry is ヴ ("vu").
# The keys must be the actual kana characters: with empty-string keys every
# entry collapses into one and no lookup can ever succeed.
_roomaji_kana = {
    # Hiragana
    u'あ': 'a', u'い': 'i', u'う': 'u', u'え': 'e', u'お': 'o',
    u'か': 'ka', u'き': 'ki', u'く': 'ku', u'け': 'ke', u'こ': 'ko',
    u'さ': 'sa', u'し': 'shi', u'す': 'su', u'せ': 'se', u'そ': 'so',
    u'た': 'ta', u'ち': 'chi', u'つ': 'tsu', u'て': 'te', u'と': 'to',
    u'な': 'na', u'に': 'ni', u'ぬ': 'nu', u'ね': 'ne', u'の': 'no',
    u'は': 'ha', u'ひ': 'hi', u'ふ': 'fu', u'へ': 'he', u'ほ': 'ho',
    u'ま': 'ma', u'み': 'mi', u'む': 'mu', u'め': 'me', u'も': 'mo',
    u'や': 'ya', u'ゆ': 'yu', u'よ': 'yo',
    u'ら': 'ra', u'り': 'ri', u'る': 'ru', u'れ': 're', u'ろ': 'ro',
    u'わ': 'wa', u'ゐ': 'wi', u'ゑ': 'we', u'を': 'wo',
    u'ん': 'n',
    u'が': 'ga', u'ぎ': 'gi', u'ぐ': 'gu', u'げ': 'ge', u'ご': 'go',
    u'ざ': 'za', u'じ': 'ji', u'ず': 'zu', u'ぜ': 'ze', u'ぞ': 'zo',
    u'だ': 'da', u'ぢ': 'ji', u'づ': 'dzu', u'で': 'de', u'ど': 'do',
    u'ば': 'ba', u'び': 'bi', u'ぶ': 'bu', u'べ': 'be', u'ぼ': 'bo',
    u'ぱ': 'pa', u'ぴ': 'pi', u'ぷ': 'pu', u'ぺ': 'pe', u'ぽ': 'po',
    # Katakana
    u'ア': 'a', u'イ': 'i', u'ウ': 'u', u'エ': 'e', u'オ': 'o',
    u'カ': 'ka', u'キ': 'ki', u'ク': 'ku', u'ケ': 'ke', u'コ': 'ko',
    u'サ': 'sa', u'シ': 'shi', u'ス': 'su', u'セ': 'se', u'ソ': 'so',
    u'タ': 'ta', u'チ': 'chi', u'ツ': 'tsu', u'テ': 'te', u'ト': 'to',
    u'ナ': 'na', u'ニ': 'ni', u'ヌ': 'nu', u'ネ': 'ne', u'ノ': 'no',
    u'ハ': 'ha', u'ヒ': 'hi', u'フ': 'fu', u'ヘ': 'he', u'ホ': 'ho',
    u'マ': 'ma', u'ミ': 'mi', u'ム': 'mu', u'メ': 'me', u'モ': 'mo',
    u'ヤ': 'ya', u'ユ': 'yu', u'ヨ': 'yo',
    u'ラ': 'ra', u'リ': 'ri', u'ル': 'ru', u'レ': 're', u'ロ': 'ro',
    u'ワ': 'wa', u'ヰ': 'wi', u'ヱ': 'we', u'ヲ': 'wo',
    u'ン': 'n',
    u'ガ': 'ga', u'ギ': 'gi', u'グ': 'gu', u'ゲ': 'ge', u'ゴ': 'go',
    u'ザ': 'za', u'ジ': 'ji', u'ズ': 'zu', u'ゼ': 'ze', u'ゾ': 'zo',
    u'ダ': 'da', u'ヂ': 'ji', u'ヅ': 'dzu', u'デ': 'de', u'ド': 'do',
    u'バ': 'ba', u'ビ': 'bi', u'ブ': 'bu', u'ベ': 'be', u'ボ': 'bo',
    u'パ': 'pa', u'ピ': 'pi', u'プ': 'pu', u'ペ': 'pe', u'ポ': 'po',
    u'ヴ': 'vu',
}
# Maps the small ya/yu/yo kana (youon) to the sound they contribute when they
# follow an -i kana.  The keys must be the real small kana; empty-string keys
# would collapse the table to a single unusable entry.
_roomaji_youon = {
    # Hiragana
    u'ゃ': 'ya', u'ゅ': 'yu', u'ょ': 'yo',
    # Katakana
    u'ャ': 'ya', u'ュ': 'yu', u'ョ': 'yo',
}
# XXX If romanize() ever handles hiragana, it will need to make sure that the
# preceding character was a katakana
# This does not include every small kana combination, but should include every
# one used in a Pokémon name. An exhaustive list would be.. very long
# Small katakana vowels, mapped to the plain vowel they fall back to when the
# preceding kana does not form a known combination.  The keys must be the
# real small kana; empty-string keys would never match any input character.
_roomaji_small_kana = {
    u'ァ': 'a', u'ィ': 'i', u'ゥ': 'u', u'ェ': 'e', u'ォ': 'o',
}
# Two-character sequences (full-size katakana + small vowel) whose
# romanization replaces the reading of the full-size kana outright.
# The choices are fairly arbitrary; reading "shi" + small "i" as "sy" is the
# oddest of them, but it appears to be what GF intends.
_roomaji_small_kana_combos = {
    # --- The small vowel simply replaces the vowel of the base kana ---
    # w-
    u'ウィ': 'wi',
    u'ウゥ': 'wu',
    u'ウェ': 'we',
    u'ウォ': 'wo',
    # v-
    u'ヴァ': 'va',
    u'ヴィ': 'vi',
    u'ヴェ': 've',
    u'ヴォ': 'vo',
    # ch- / sh- / j-
    u'チェ': 'che',
    u'シェ': 'she',
    u'ジェ': 'je',
    # t-
    u'テァ': 'tha',
    u'ティ': 'ti',
    u'テゥ': 'thu',
    u'テェ': 'tye',
    u'テォ': 'tho',
    # d-
    u'デァ': 'dha',
    u'ディ': 'di',
    u'デゥ': 'dhu',
    u'デェ': 'dye',
    u'デォ': 'dho',
    # f- / h-
    u'ファ': 'fa',
    u'フィ': 'fi',
    u'ホゥ': 'hu',
    u'フェ': 'fe',
    u'フォ': 'fo',
    # --- Less regular: -i kana + small "i" collapses to a trailing "y" ---
    u'シィ': 'sy',
    u'ミィ': 'my',
    u'ビィ': 'by',
    u'ピィ': 'py',
}
def romanize(string):
    """Convert a string of kana to roomaji.

    Characters are processed left to right:

    * Full-width Latin characters (U+FF01..U+FF5E) are mapped to their ASCII
      equivalents.
    * Small vowel kana either merge with the preceding character (when the
      pair is listed in `_roomaji_small_kana_combos`) or are treated as a
      plain vowel.
    * Youon (small ya/yu/yo) replace the trailing -i of the preceding kana.
    * Sokuon doubles the first consonant of the following kana.
    * The long-vowel mark repeats the last vowel.
    * Anything else is passed through unchanged.

    `string` should be a text (unicode) string.  Returns a text string.

    Raises ValueError when a sokuon, youon or long-vowel mark appears in a
    position where it cannot be romanized (e.g. sokuon before a vowel or at
    the end of the string, youon not following an -i sound, a long-vowel mark
    not following a vowel).
    """
    vowels = ('a', 'e', 'i', 'o', 'u', 'y')

    characters = []
    last_kana = None  # Used for ー; っ or ッ; ん or ン
    last_char = None  # Used for small kana combos

    for char in string:
        # Full-width Latin
        if 0xff01 <= ord(char) <= 0xff5e:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon cannot precede Latin characters.")

            # XXX Real Unicode decomposition would be nicer
            char = chr(ord(char) - 0xff01 + 0x21)
            characters.append(char)

            last_kana = None

        # Small vowel kana
        elif char in _roomaji_small_kana:
            # At the very start of the string there is no previous character
            # to combine with; fall through to the plain-vowel handling
            # instead of crashing on None + char.
            combo = (last_char or u'') + char
            if combo in _roomaji_small_kana_combos:
                characters[-1] = _roomaji_small_kana_combos[combo]
            else:
                # If we don't know what it is...  act dumb and treat it as a
                # full-size vowel.  Better than bailing, and seems to occur a
                # lot, e.g. ピィ is "pii"
                characters.append(_roomaji_small_kana[char])

            last_kana = _roomaji_small_kana[char]

        # Youon
        elif char in _roomaji_youon:
            if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                raise ValueError("Youon must follow an -i sound.")

            # Drop the -i and append the ya/yu/yo sound
            new_sound = _roomaji_youon[char]
            if last_kana in ['chi', 'shi', 'ji']:
                # Strip the y-
                new_char = last_kana[:-1] + new_sound[1:]
            else:
                new_char = last_kana[:-1] + new_sound

            characters[-1] = new_char
            last_kana = new_char

        # Sokuon
        elif char in (u'っ', u'ッ'):
            # Remember it and double the consonant next time around
            last_kana = 'sokuon'

        # Extended vowel or n
        elif char == u'ー':
            # last_kana is None at the start of the string and after any
            # non-kana character; report that as a ValueError as well rather
            # than failing on None[-1].
            if not last_kana or last_kana[-1] not in vowels:
                raise ValueError(u"'ー' must follow a vowel.")

            characters.append(last_kana[-1])

            last_kana = None

        # Regular ol' kana
        elif char in _roomaji_kana:
            kana = _roomaji_kana[char]

            if last_kana == 'sokuon':
                if kana[0] in vowels:
                    raise ValueError("Sokuon cannot precede a vowel.")

                characters.append(kana[0])
            elif last_kana == 'n' and kana[0] in vowels:
                # Disambiguate syllabic n from an n- kana, e.g. "n'a" vs "na"
                characters.append("'")

            characters.append(kana)

            last_kana = kana

        # Not Japanese?
        else:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon must be followed by another kana.")

            characters.append(char)

            last_kana = None

        last_char = char

    if last_kana == 'sokuon':
        raise ValueError("Sokuon cannot be the last character.")

    # u''.join yields a text string on both Python 2 and Python 3, without
    # relying on the Python 2-only `unicode` builtin.
    return u''.join(characters)