veekun_pokedex/pokedex/extract/lib/text.py

116 lines
4 KiB
Python
Raw Permalink Normal View History

2016-02-26 18:05:51 +00:00
def merge_japanese_texts(kanji, kana, html=False):
    """Combine a (presumably equivalent) pair of kanji and kana strings into a
    single string of kanji with furigana.

    `kanji` is treated as the canonical text: wherever the two strings agree,
    the kanji string's character is emitted as-is; wherever they differ, the
    differing kanji run is emitted as a ruby base annotated with the
    corresponding kana run.

    If `html` is truthy, the return value will contain HTML ruby tags;
    otherwise it will use the Unicode "interlinear annotation" characters
    (U+FFF9 base U+FFFA annotation U+FFFB).

    NOTE: in HTML mode the text is interpolated verbatim; nothing here escapes
    `<`, `>` or `&`, so callers must only pass text that is safe to embed.

    This relies on the Needleman–Wunsch algorithm for sequence alignment:
    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """
    # Index of the score within a table cell; see the cell layout below.
    score_index = 3

    # "Junk" is whitespace and sentence punctuation: it needs no reading, any
    # two junk characters are considered equivalent, and skipping one costs
    # nothing in the alignment.
    isjunk = {ch: ch.isspace() or ch in '。␤' for ch in kanji + kana}

    # The table is a dict keyed by (kanji index, kana index) so that -1 can
    # address the "before the start of the string" border row and column.
    # Each cell is:
    #   (continue left, continue up, are the characters equivalent, score)
    # "left" steps back one kanji character, "up" steps back one kana
    # character, and both together is a diagonal (paired) step.
    table = {(-1, -1): (False, False, True, 0)}
    # Border cells: the only way to get there is a run of gaps.
    for i in range(len(kanji)):
        table[i, -1] = True, False, False, -1 - i
    for i in range(len(kana)):
        table[-1, i] = False, True, False, -1 - i

    for a, ach in enumerate(kanji):
        for b, bch in enumerate(kana):
            equiv = ach == bch or (isjunk[ach] and isjunk[bch])
            options = (
                # Diagonal: the two characters are paired, as either a match
                # or a mismatch
                (True, True, equiv,
                    table[a - 1, b - 1][score_index] + (1 if equiv else -1)),
                # Gap in the kana: the kanji character stands alone
                (True, False, equiv,
                    table[a - 1, b][score_index] + (0 if isjunk[ach] else -1)),
                # Gap in the kanji: the kana character stands alone
                (False, True, equiv,
                    table[a, b - 1][score_index] + (0 if isjunk[bch] else -1)),
            )
            # Strictly speaking, in the case of a tie, all of the "best"
            # choices are supposed to be preserved.  We only need one
            # alignment, so keep the first best option (max() returns the
            # first maximal item, which prefers the diagonal).
            table[a, b] = max(options, key=lambda opt: opt[score_index])

    if html:
        ruby_format = "<ruby><rb>{}</rb><rt>{}</rt></ruby>"
    else:
        ruby_format = "\ufff9{}\ufffa{}\ufffb"

    # The traceback walks from the end of both strings to the start, so
    # `final` and both mismatch buffers are built in REVERSE text order.
    final = []
    mismatch_a = []
    mismatch_b = []

    def add_mismatches():
        """Flush the pending mismatch run into `final` as one ruby group."""
        # Junk at the edges of the run is kept outside the ruby group -- but
        # only the kanji junk is kept at all, since kanji is "canonical".
        # Index 0 is the END of the run in text order.
        while mismatch_a and isjunk[mismatch_a[0]]:
            final.append(mismatch_a.pop(0))
        while mismatch_b and isjunk[mismatch_b[0]]:
            mismatch_b.pop(0)
        # Index -1 is the START of the run in text order, so this collects the
        # leading junk in text order.
        leading_junk = []
        while mismatch_a and isjunk[mismatch_a[-1]]:
            leading_junk.append(mismatch_a.pop())
        while mismatch_b and isjunk[mismatch_b[-1]]:
            mismatch_b.pop()
        # A run that consisted solely of junk leaves nothing to annotate;
        # don't emit an empty ruby group for it.
        if mismatch_a or mismatch_b:
            final.append(ruby_format.format(
                ''.join(reversed(mismatch_a)),
                ''.join(reversed(mismatch_b)),
            ))
        # `final` is reversed, so the leading junk has to go in reversed too.
        final.extend(reversed(leading_junk))
        del mismatch_a[:]
        del mismatch_b[:]

    a = len(kanji) - 1
    b = len(kana) - 1
    while True:
        walk_left, walk_up, equiv, _ = table[a, b]
        if walk_left and walk_up:
            if equiv:
                # A matched character ends any pending mismatch run
                if mismatch_a or mismatch_b:
                    add_mismatches()
                final.append(kanji[a])
            else:
                mismatch_a.append(kanji[a])
                mismatch_b.append(kana[b])
            a -= 1
            b -= 1
        elif walk_left:
            mismatch_a.append(kanji[a])
            a -= 1
        elif walk_up:
            mismatch_b.append(kana[b])
            b -= 1
        else:
            # Only the (-1, -1) origin cell has neither direction set
            break

    if mismatch_a or mismatch_b:
        add_mismatches()

    return ''.join(reversed(final))