veekun_pokedex/pokedex/roomaji.py

161 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# encoding: utf8
"""Provides `romanize()` for romanizing simple Japanese text."""
# Maps each full-size kana to its romanization.  Both scripts are listed
# row by row (vowels, k, s, t, n, h, m, y, r, w, n, then the voiced g, z, d, b
# rows and the p row) so the two halves can be compared at a glance.
_roomaji_kana = {
    # Hiragana
    u'あ': 'a', u'い': 'i', u'う': 'u', u'え': 'e', u'お': 'o',
    u'か': 'ka', u'き': 'ki', u'く': 'ku', u'け': 'ke', u'こ': 'ko',
    u'さ': 'sa', u'し': 'shi', u'す': 'su', u'せ': 'se', u'そ': 'so',
    u'た': 'ta', u'ち': 'chi', u'つ': 'tsu', u'て': 'te', u'と': 'to',
    u'な': 'na', u'に': 'ni', u'ぬ': 'nu', u'ね': 'ne', u'の': 'no',
    u'は': 'ha', u'ひ': 'hi', u'ふ': 'fu', u'へ': 'he', u'ほ': 'ho',
    u'ま': 'ma', u'み': 'mi', u'む': 'mu', u'め': 'me', u'も': 'mo',
    u'や': 'ya', u'ゆ': 'yu', u'よ': 'yo',
    u'ら': 'ra', u'り': 'ri', u'る': 'ru', u'れ': 're', u'ろ': 'ro',
    u'わ': 'wa', u'ゐ': 'wi', u'ゑ': 'we', u'を': 'wo',
    u'ん': 'n',
    u'が': 'ga', u'ぎ': 'gi', u'ぐ': 'gu', u'げ': 'ge', u'ご': 'go',
    u'ざ': 'za', u'じ': 'ji', u'ず': 'zu', u'ぜ': 'ze', u'ぞ': 'zo',
    u'だ': 'da', u'ぢ': 'ji', u'づ': 'dzu', u'で': 'de', u'ど': 'do',
    u'ば': 'ba', u'び': 'bi', u'ぶ': 'bu', u'べ': 'be', u'ぼ': 'bo',
    u'ぱ': 'pa', u'ぴ': 'pi', u'ぷ': 'pu', u'ぺ': 'pe', u'ぽ': 'po',
    # Katakana
    u'ア': 'a', u'イ': 'i', u'ウ': 'u', u'エ': 'e', u'オ': 'o',
    u'カ': 'ka', u'キ': 'ki', u'ク': 'ku', u'ケ': 'ke', u'コ': 'ko',
    u'サ': 'sa', u'シ': 'shi', u'ス': 'su', u'セ': 'se', u'ソ': 'so',
    u'タ': 'ta', u'チ': 'chi', u'ツ': 'tsu', u'テ': 'te', u'ト': 'to',
    u'ナ': 'na', u'ニ': 'ni', u'ヌ': 'nu', u'ネ': 'ne', u'ノ': 'no',
    u'ハ': 'ha', u'ヒ': 'hi', u'フ': 'fu', u'ヘ': 'he', u'ホ': 'ho',
    u'マ': 'ma', u'ミ': 'mi', u'ム': 'mu', u'メ': 'me', u'モ': 'mo',
    u'ヤ': 'ya', u'ユ': 'yu', u'ヨ': 'yo',
    u'ラ': 'ra', u'リ': 'ri', u'ル': 'ru', u'レ': 're', u'ロ': 'ro',
    u'ワ': 'wa', u'ヰ': 'wi', u'ヱ': 'we', u'ヲ': 'wo',
    u'ン': 'n',
    u'ガ': 'ga', u'ギ': 'gi', u'グ': 'gu', u'ゲ': 'ge', u'ゴ': 'go',
    u'ザ': 'za', u'ジ': 'ji', u'ズ': 'zu', u'ゼ': 'ze', u'ゾ': 'zo',
    u'ダ': 'da', u'ヂ': 'ji', u'ヅ': 'dzu', u'デ': 'de', u'ド': 'do',
    u'バ': 'ba', u'ビ': 'bi', u'ブ': 'bu', u'ベ': 'be', u'ボ': 'bo',
    u'パ': 'pa', u'ピ': 'pi', u'プ': 'pu', u'ペ': 'pe', u'ポ': 'po',
}
# Small ya/yu/yo kana.  romanize() merges these into the preceding -i kana
# (e.g. "ki" + small "ya" becomes "kya").
_roomaji_youon = {
    # Hiragana
    u'ゃ': 'ya', u'ゅ': 'yu', u'ょ': 'yo',
    # Katakana
    u'ャ': 'ya', u'ュ': 'yu', u'ョ': 'yo',
}
# XXX If romanize() ever handles hiragana, it will need to make sure that the
# preceding character was a katakana
# This does not include every small kana combination, but should include every
# one used in a Pokémon name. An exhaustive list would be... very long.
# Small katakana vowels and the plain vowel each one stands for.  romanize()
# uses the value both as the fallback output (when the pair is not a known
# combo) and as the "last sound" for a following long-vowel mark.
_roomaji_small_kana = {
    u'ァ': 'a', u'ィ': 'i', u'ゥ': 'u', u'ェ': 'e', u'ォ': 'o',
}
# Two-character sequences (full-size katakana followed by a small vowel) whose
# romanization replaces the sound already emitted for the first character.
_roomaji_small_kana_combos = dict((
    # One-off pairs
    (u'ウィ', 'wi'),
    (u'チェ', 'che'),
    (u'シェ', 'she'),
    # テ + small vowel
    (u'テァ', 'tha'),
    (u'ティ', 'ti'),
    (u'テゥ', 'thu'),
    (u'テェ', 'tye'),
    (u'テォ', 'tho'),
    # デ + small vowel
    (u'デァ', 'dha'),
    (u'ディ', 'di'),
    (u'デゥ', 'dhu'),
    (u'デェ', 'dye'),
    (u'デォ', 'dho'),
    # f-/h- row; note that "hu" is keyed on ホ rather than フ
    (u'ファ', 'fa'),
    (u'フィ', 'fi'),
    (u'ホゥ', 'hu'),
    (u'フェ', 'fe'),
    (u'フォ', 'fo'),
))
def romanize(string):
    """Convert a string of kana to roomaji.

    The input is processed one character at a time:

    * full-width Latin characters are mapped to their ASCII counterparts;
    * small vowel kana either complete a known two-character combo or are
      treated as a full-size vowel;
    * small ya/yu/yo are merged into the preceding -i kana;
    * a sokuon doubles the first consonant of the following kana;
    * the long-vowel mark repeats the previous vowel;
    * an apostrophe is inserted between "n" and a following vowel or y-;
    * anything else is copied through unchanged.

    Returns the romanized text as a (unicode) string.

    Raises ValueError when a sokuon is not followed by a consonant-initial
    kana, when a youon does not follow an -i kana, or when a long-vowel mark
    has no preceding vowel to extend.
    """
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']

    characters = []
    last_kana = None  # Used for ー; っ or ッ; ん or ン
    last_char = None  # Used for small kana combos
    for char in string:
        # Full-width Latin
        # NOTE(review): the range starts at U+FF11, so full-width '０'
        # (U+FF10) and the punctuation before it fall through to the
        # "not Japanese" branch unchanged -- confirm that is intended.
        if 0xff11 <= ord(char) <= 0xff5e:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon cannot precede Latin characters.")

            char = chr(ord(char) - 0xff11 + 0x31)
            characters.append(char)

            last_kana = None

        # Small vowel kana
        elif char in _roomaji_small_kana:
            # A small kana at the very start has nothing to combine with.
            combo = None if last_char is None else last_char + char
            if combo in _roomaji_small_kana_combos:
                # Replace the sound emitted for the preceding character.
                characters[-1] = _roomaji_small_kana_combos[combo]
            else:
                # If we don't know what it is... act dumb and treat it as a
                # full-size vowel.  Better than bailing, and seems to occur a
                # lot, e.g. ピィ is "pii"
                characters.append(_roomaji_small_kana[char])

            # Either way the last sound is now this vowel, so a following
            # long-vowel mark extends it.
            last_kana = _roomaji_small_kana[char]

        # Youon
        elif char in _roomaji_youon:
            if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                raise ValueError("Youon must follow an -i sound.")

            # Drop the -i and append the ya/yu/yo sound
            new_sound = _roomaji_youon[char]
            if last_kana in ['chi', 'shi', 'ji']:
                # Strip the y-
                new_char = last_kana[:-1] + new_sound[1:]
            else:
                new_char = last_kana[:-1] + new_sound

            characters[-1] = new_char
            last_kana = new_char

        # Sokuon
        elif char in (u'っ', u'ッ'):
            # Remember it and double the consonant next time around
            last_kana = 'sokuon'

        # Extended vowel
        elif char == u'ー':
            # last_kana is None at the start of the string and after Latin,
            # non-kana, or another long-vowel mark; report that as bad input
            # rather than crashing on the subscript.
            if last_kana is None or last_kana[-1] not in vowels:
                raise ValueError(u"'ー' must follow a vowel.")

            characters.append(last_kana[-1])

            last_kana = None

        # Regular ol' kana
        elif char in _roomaji_kana:
            kana = _roomaji_kana[char]

            if last_kana == 'sokuon':
                if kana[0] in vowels:
                    raise ValueError("Sokuon cannot precede a vowel.")

                # Double the leading consonant
                characters.append(kana[0])
            elif last_kana == 'n' and kana[0] in vowels:
                # Keep "n" + vowel distinct from the na/ni/nu/ne/no kana
                characters.append("'")

            characters.append(kana)

            last_kana = kana

        # Not Japanese?
        else:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon must be followed by another kana.")

            characters.append(char)

            last_kana = None

        last_char = char

    if last_kana == 'sokuon':
        raise ValueError("Sokuon cannot be the last character.")

    # u''.join yields a unicode string on Python 2 and a str on Python 3,
    # without relying on the Python 2-only `unicode` builtin.
    return u''.join(characters)