veekun_pokedex/pokedex/roomaji.py
Andrew Ekstedt a91a7d95b3 Allow small ッ before a vowel
This is a nonstandard use of ッ and it doesn't really have a defined
romanization, but we need to support it for Cramorant (ウッウ, U'u).
2021-03-07 11:26:50 -08:00

244 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# encoding: utf8
"""Provides `romanize()` for romanizing simple Japanese text.

Also provides available romanizers in a dictionary keyed by language identifier
(`romanizers`); `romanize()` picks one of them by its `lang` argument.
"""
class Romanizer(object):
    """Table-driven converter from kana to roomaji (Latin letters).

    All language-specific knowledge lives in the lookup tables passed to the
    constructor; every table also becomes an attribute of the instance.
    `romanize()` reads the following tables:

    roomaji_kana: full-size kana -> romanized syllable
    roomaji_youon: small ya/yu/yo kana -> sound appended to an -i syllable
    roomaji_small_kana: small vowel kana -> vowel
    roomaji_small_kana_combos: two-character kana string -> syllable
    lengthened_vowels: vowel -> replacement used when the vowel is long
    y_drop: -i syllable -> prefix used before a youon whose leading letter
        is dropped (e.g. 'shi' -> 'sh', so shi + ya gives 'sha')
    """

    # Letters treated as vowels when placing apostrophes and long vowels.
    _VOWELS = frozenset('aeiouy')
    # Small tsu (hiragana and katakana): doubles the next consonant.
    _SOKUON = (u'っ', u'ッ')
    # Long-vowel mark: repeats (or lengthens) the preceding vowel.
    _CHOONPU = u'ー'

    def __init__(self, parent=None, **tables):
        """Create a Romanizer

        parent: A Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is given,
            its tables are used, and updated with the given ones. Neither the
            parent's `tables` mapping nor any of its individual tables is
            modified; a table the parent does not have is simply added.
        """
        self.parent = parent
        if parent:
            # Copy the outer mapping too: rebinding a name below must not
            # leak back into parent.tables.
            merged = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables
                combined = dict(merged.get(name, {}))
                combined.update(table)
                merged[name] = combined
            self.tables = merged
        else:
            self.tables = tables

        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        Characters that are neither kana nor full-width Latin are passed
        through unchanged.

        Raises ValueError when a sokuon is not followed by a kana, when a
        youon does not follow an -i syllable, or when a long-vowel mark does
        not follow a vowel.
        """
        vowels = self._VOWELS

        characters = []   # Output chunks, joined at the end
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos
        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # At the very start of the string there is nothing to combine
                # with, so the small kana falls through to the "act dumb" path.
                combo = None if last_char is None else last_char + char
                if combo in self.roomaji_small_kana_combos and characters:
                    characters[-1] = self.roomaji_small_kana_combos[combo]
                else:
                    # If we don't know what it is... act dumb and treat it as a
                    # full-size vowel. Better than bailing, and seems to occur
                    # a lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the y-
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in self._SOKUON:
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Extended vowel
            elif char == self._CHOONPU:
                # `not last_kana` covers a mark at the start of the string or
                # right after a non-kana character.
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'ー' must follow by a vowel.")

                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the short vowel for its lengthened form
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular ol' kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        # Nonstandard sokuon before a vowel (e.g. ウッウ):
                        # there is no consonant to double, mark the break.
                        characters.append("'")
                    else:
                        characters.append(kana[0])
                elif last_kana == 'n' and kana[0] in vowels:
                    # Keep syllabic n apart from the following vowel
                    characters.append("'")

                # Special characters for doubled kana.  Slicing with [-1:]
                # keeps this safe if the previous chunk is empty.
                if (kana[0] in self.lengthened_vowels and characters
                        and kana == characters[-1][-1:]):
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        return u''.join(characters)
# Available romanizers, keyed by language identifier.
romanizers = dict()

# English (Hepburn-like) romanization.  The kana keys are listed in gojuuon
# order, one row per line, so each key lines up with its romanized value.
romanizers['en'] = Romanizer(
    roomaji_kana={
        # Hiragana
        u'あ': 'a', u'い': 'i', u'う': 'u', u'え': 'e', u'お': 'o',
        u'か': 'ka', u'き': 'ki', u'く': 'ku', u'け': 'ke', u'こ': 'ko',
        u'さ': 'sa', u'し': 'shi', u'す': 'su', u'せ': 'se', u'そ': 'so',
        u'た': 'ta', u'ち': 'chi', u'つ': 'tsu', u'て': 'te', u'と': 'to',
        u'な': 'na', u'に': 'ni', u'ぬ': 'nu', u'ね': 'ne', u'の': 'no',
        u'は': 'ha', u'ひ': 'hi', u'ふ': 'fu', u'へ': 'he', u'ほ': 'ho',
        u'ま': 'ma', u'み': 'mi', u'む': 'mu', u'め': 'me', u'も': 'mo',
        u'や': 'ya', u'ゆ': 'yu', u'よ': 'yo',
        u'ら': 'ra', u'り': 'ri', u'る': 'ru', u'れ': 're', u'ろ': 'ro',
        u'わ': 'wa', u'ゐ': 'wi', u'ゑ': 'we', u'を': 'wo',
        u'ん': 'n',
        u'が': 'ga', u'ぎ': 'gi', u'ぐ': 'gu', u'げ': 'ge', u'ご': 'go',
        u'ざ': 'za', u'じ': 'ji', u'ず': 'zu', u'ぜ': 'ze', u'ぞ': 'zo',
        u'だ': 'da', u'ぢ': 'ji', u'づ': 'dzu', u'で': 'de', u'ど': 'do',
        u'ば': 'ba', u'び': 'bi', u'ぶ': 'bu', u'べ': 'be', u'ぼ': 'bo',
        u'ぱ': 'pa', u'ぴ': 'pi', u'ぷ': 'pu', u'ぺ': 'pe', u'ぽ': 'po',

        # Katakana
        u'ア': 'a', u'イ': 'i', u'ウ': 'u', u'エ': 'e', u'オ': 'o',
        u'カ': 'ka', u'キ': 'ki', u'ク': 'ku', u'ケ': 'ke', u'コ': 'ko',
        u'サ': 'sa', u'シ': 'shi', u'ス': 'su', u'セ': 'se', u'ソ': 'so',
        u'タ': 'ta', u'チ': 'chi', u'ツ': 'tsu', u'テ': 'te', u'ト': 'to',
        u'ナ': 'na', u'ニ': 'ni', u'ヌ': 'nu', u'ネ': 'ne', u'ノ': 'no',
        u'ハ': 'ha', u'ヒ': 'hi', u'フ': 'fu', u'ヘ': 'he', u'ホ': 'ho',
        u'マ': 'ma', u'ミ': 'mi', u'ム': 'mu', u'メ': 'me', u'モ': 'mo',
        u'ヤ': 'ya', u'ユ': 'yu', u'ヨ': 'yo',
        u'ラ': 'ra', u'リ': 'ri', u'ル': 'ru', u'レ': 're', u'ロ': 'ro',
        u'ワ': 'wa', u'ヰ': 'wi', u'ヱ': 'we', u'ヲ': 'wo',
        u'ン': 'n',
        u'ガ': 'ga', u'ギ': 'gi', u'グ': 'gu', u'ゲ': 'ge', u'ゴ': 'go',
        u'ザ': 'za', u'ジ': 'ji', u'ズ': 'zu', u'ゼ': 'ze', u'ゾ': 'zo',
        u'ダ': 'da', u'ヂ': 'ji', u'ヅ': 'dzu', u'デ': 'de', u'ド': 'do',
        u'バ': 'ba', u'ビ': 'bi', u'ブ': 'bu', u'ベ': 'be', u'ボ': 'bo',
        u'パ': 'pa', u'ピ': 'pi', u'プ': 'pu', u'ペ': 'pe', u'ポ': 'po',
        u'ヴ': 'vu',
    },

    roomaji_youon={
        # Hiragana
        u'ゃ': 'ya', u'ゅ': 'yu', u'ょ': 'yo',

        # Katakana
        u'ャ': 'ya', u'ュ': 'yu', u'ョ': 'yo',
    },

    # XXX If romanize() ever handles small hiragana vowels, it will need to
    # make sure that the preceding character was a katakana
    # This does not include every small kana combination, but should include
    # every one used in a Pokémon name. An exhaustive list would be.. very long
    roomaji_small_kana={
        # Small katakana vowels only
        u'ァ': 'a', u'ィ': 'i', u'ゥ': 'u', u'ェ': 'e', u'ォ': 'o',
    },
    roomaji_small_kana_combos={
        # These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
        # particularly weird, but it seems to be what GF intends

        # Simple vowel replacement
        u'ウィ': 'wi', u'ウゥ': 'wu', u'ウェ': 'we', u'ウォ': 'wo',
        u'ヴァ': 'va', u'ヴィ': 'vi', u'ヴェ': 've', u'ヴォ': 'vo',
        u'チェ': 'che',
        u'シェ': 'she',
        u'ジェ': 'je',
        u'テァ': 'tha', u'ティ': 'ti', u'テゥ': 'thu', u'テェ': 'tye', u'テォ': 'tho',
        u'デァ': 'dha', u'ディ': 'di', u'デゥ': 'dhu', u'デェ': 'dye', u'デォ': 'dho',
        u'ファ': 'fa', u'フィ': 'fi', u'ホゥ': 'hu', u'フェ': 'fe', u'フォ': 'fo',

        # Not so much
        u'シィ': 'sy',
        u'ミィ': 'my',
        u'ビィ': 'by',
        u'ピィ': 'py',
    },
    # English output has no special long-vowel letters: long vowels are
    # simply written twice.
    lengthened_vowels={},
    # Before a youon these syllables lose their -i and the youon loses its y-
    y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
)
# Czech romanization: inherits the English tables and overrides only the
# entries whose Czech spelling differs.
romanizers['cs'] = Romanizer(parent=romanizers['en'],
    roomaji_kana={
        # Hiragana
        u'し': u'ši', u'ち': u'či', u'つ': u'cu',
        u'や': u'ja', u'ゆ': u'ju', u'よ': u'jo',
        u'じ': u'dži', u'ぢ': u'dži',
        # Katakana
        u'シ': u'ši', u'チ': u'či', u'ツ': u'cu',
        u'ヤ': u'ja', u'ユ': u'ju', u'ヨ': u'jo',
        u'ジ': u'dži', u'ヂ': u'dži',
    },
    roomaji_youon={
        # Hiragana
        u'ゃ': 'ja', u'ゅ': 'ju', u'ょ': 'jo',
        # Katakana
        u'ャ': 'ja', u'ュ': 'ju', u'ョ': 'jo',
    },
    roomaji_small_kana_combos={
        u'チェ': u'če', u'シェ': u'še', u'ジェ': u'dže',
        u'テェ': u'tje', u'デェ': u'dje',
        # NOTE(review): these four values were empty strings in the copy of
        # the file under review; they are reconstructed here as long-i forms
        # in line with lengthened_vowels -- confirm against upstream.
        u'シィ': u'sí', u'ミィ': u'mí', u'ビィ': u'bí', u'ピィ': u'pí',
    },
    lengthened_vowels={'a': u'á', 'e': u'é', 'i': u'í', 'o': u'ó', 'u': u'ú'},
    # NOTE(review): the value for u'dži' was also empty; u'dž' follows the
    # pattern of the neighbouring entries -- confirm against upstream.
    y_drop={u'či': u'č', u'ši': u'š', u'dži': u'dž', u'ni': u'ňj'},
)
def romanize(string, lang='en'):
    """Convert a string of kana to roomaji.

    string: the text to romanize
    lang: language identifier selecting an entry of `romanizers`; an
        unknown identifier falls back to the English romanizer

    Returns the romanized string. Propagates any ValueError raised by the
    selected romanizer for malformed kana sequences.
    """
    # Get the correct romanizer; fall back to English.  The fallback has to
    # be the Romanizer object itself, not the key 'en'.
    romanizer = romanizers.get(lang)
    if romanizer is None:
        romanizer = romanizers['en']

    # Romanize away!
    return romanizer.romanize(string)