mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
115 lines
4 KiB
Python
115 lines
4 KiB
Python
def merge_japanese_texts(kanji, kana, html=False):
|
||
"""Combine a (presuambly equivalent) pair of kanji and kana strings into a
|
||
single string of kanji with furigana.
|
||
|
||
If `html` is truthy, the return value will contain HTML ruby tags;
|
||
otherwise it will use the Unicode "interlinear annotation" characters.
|
||
|
||
This relies on the Needleman–Wunsch algorithm for sequence alignment:
|
||
https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
|
||
"""
|
||
# TODO maybe this is faster, but then -1 doesn't work
|
||
#table = [
|
||
# [None for _ in range(len(kana))]
|
||
# for _ in range(len(kanji))
|
||
#]
|
||
table = {}
|
||
# continue left, continue up, are the characters equivalent, score for this
|
||
# cell
|
||
table[-1, -1] = False, False, True, 0
|
||
|
||
isjunk = {}
|
||
for ch in kanji + kana:
|
||
isjunk[ch] = ch.isspace() or ch in '。'
|
||
|
||
# initialize, TODO, something about scoring compared to a gap
|
||
for i, ch in enumerate(kanji):
|
||
table[i, -1] = True, False, False, -1 - i
|
||
for i, ch in enumerate(kana):
|
||
table[-1, i] = False, True, False, -1 - i
|
||
for a, ach in enumerate(kanji):
|
||
for b, bch in enumerate(kana):
|
||
options = []
|
||
# Continue diagonally means two characters together, either a match
|
||
# or a mismatch
|
||
if ach == bch or (isjunk[ach] and isjunk[bch]):
|
||
equiv = True
|
||
score = 1
|
||
else:
|
||
equiv = False
|
||
score = -1
|
||
options.append((True, True, equiv, table[a - 1, b - 1][2] + score))
|
||
|
||
# Continue from or side means an indel... -1
|
||
if isjunk[ach]:
|
||
score = 0
|
||
else:
|
||
score = -1
|
||
options.append((True, False, equiv, table[a - 1, b][2] + score))
|
||
if isjunk[bch]:
|
||
score = 0
|
||
else:
|
||
score = -1
|
||
options.append((False, True, equiv, table[a, b - 1][2] + score))
|
||
|
||
# Strictly speaking, in the case of a tie, all of the "best"
|
||
# choices are supposed to be preserved. But we should never have a
|
||
# tie, and we have an arbitrary choice of which to use in the end
|
||
# anyway, so screw it.
|
||
table[a, b] = max(options, key=lambda opt: opt[2])
|
||
|
||
if html:
|
||
ruby_format = "<ruby><rb>{}</rb><rt>{}</rt></ruby>"
|
||
else:
|
||
ruby_format = "\ufff9{}\ufffa{}\ufffb"
|
||
|
||
def add_mismatches(mismatch_a, mismatch_b, final):
|
||
# Need to pop out any extra junk characters at the beginning or end --
|
||
# but only the kanji ones stay, since kanji is "canonical"
|
||
while mismatch_a and isjunk[mismatch_a[0]]:
|
||
final.append(mismatch_a.pop(0))
|
||
while mismatch_b and isjunk[mismatch_b[0]]:
|
||
mismatch_b.pop(0)
|
||
endjunk = []
|
||
while mismatch_a and isjunk[mismatch_a[-1]]:
|
||
endjunk.append(mismatch_a.pop())
|
||
while mismatch_b and isjunk[mismatch_b[-1]]:
|
||
mismatch_b.pop()
|
||
final.append(ruby_format.format(
|
||
''.join(reversed(mismatch_a)),
|
||
''.join(reversed(mismatch_b)),
|
||
))
|
||
final.extend(endjunk)
|
||
del mismatch_a[:]
|
||
del mismatch_b[:]
|
||
|
||
final = []
|
||
mismatch_a = []
|
||
mismatch_b = []
|
||
a = len(kanji) - 1
|
||
b = len(kana) - 1
|
||
while True:
|
||
walk_left, walk_up, equiv, score = table[a, b]
|
||
if walk_left and walk_up:
|
||
if equiv:
|
||
if mismatch_a or mismatch_b:
|
||
add_mismatches(mismatch_a, mismatch_b, final)
|
||
final.append(kanji[a])
|
||
else:
|
||
mismatch_a.append(kanji[a])
|
||
mismatch_b.append(kana[b])
|
||
a -= 1
|
||
b -= 1
|
||
elif walk_left:
|
||
mismatch_a.append(kanji[a])
|
||
a -= 1
|
||
elif walk_up:
|
||
mismatch_b.append(kana[b])
|
||
b -= 1
|
||
else:
|
||
break
|
||
|
||
if mismatch_a or mismatch_b:
|
||
add_mismatches(mismatch_a, mismatch_b, final)
|
||
|
||
return ''.join(reversed(final))
|