mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
116 lines
4 KiB
Python
116 lines
4 KiB
Python
|
def merge_japanese_texts(kanji, kana, html=False):
|
|||
|
"""Combine a (presuambly equivalent) pair of kanji and kana strings into a
|
|||
|
single string of kanji with furigana.
|
|||
|
|
|||
|
If `html` is truthy, the return value will contain HTML ruby tags;
|
|||
|
otherwise it will use the Unicode "interlinear annotation" characters.
|
|||
|
|
|||
|
This relies on the Needleman–Wunsch algorithm for sequence alignment:
|
|||
|
https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
|
|||
|
"""
|
|||
|
# TODO maybe this is faster, but then -1 doesn't work
|
|||
|
#table = [
|
|||
|
# [None for _ in range(len(kana))]
|
|||
|
# for _ in range(len(kanji))
|
|||
|
#]
|
|||
|
table = {}
|
|||
|
# continue left, continue up, are the characters equivalent, score for this
|
|||
|
# cell
|
|||
|
table[-1, -1] = False, False, True, 0
|
|||
|
|
|||
|
isjunk = {}
|
|||
|
for ch in kanji + kana:
|
|||
|
isjunk[ch] = ch.isspace() or ch in '。'
|
|||
|
|
|||
|
# initialize, TODO, something about scoring compared to a gap
|
|||
|
for i, ch in enumerate(kanji):
|
|||
|
table[i, -1] = True, False, False, -1 - i
|
|||
|
for i, ch in enumerate(kana):
|
|||
|
table[-1, i] = False, True, False, -1 - i
|
|||
|
for a, ach in enumerate(kanji):
|
|||
|
for b, bch in enumerate(kana):
|
|||
|
options = []
|
|||
|
# Continue diagonally means two characters together, either a match
|
|||
|
# or a mismatch
|
|||
|
if ach == bch or (isjunk[ach] and isjunk[bch]):
|
|||
|
equiv = True
|
|||
|
score = 1
|
|||
|
else:
|
|||
|
equiv = False
|
|||
|
score = -1
|
|||
|
options.append((True, True, equiv, table[a - 1, b - 1][2] + score))
|
|||
|
|
|||
|
# Continue from or side means an indel... -1
|
|||
|
if isjunk[ach]:
|
|||
|
score = 0
|
|||
|
else:
|
|||
|
score = -1
|
|||
|
options.append((True, False, equiv, table[a - 1, b][2] + score))
|
|||
|
if isjunk[bch]:
|
|||
|
score = 0
|
|||
|
else:
|
|||
|
score = -1
|
|||
|
options.append((False, True, equiv, table[a, b - 1][2] + score))
|
|||
|
|
|||
|
# Strictly speaking, in the case of a tie, all of the "best"
|
|||
|
# choices are supposed to be preserved. But we should never have a
|
|||
|
# tie, and we have an arbitrary choice of which to use in the end
|
|||
|
# anyway, so screw it.
|
|||
|
table[a, b] = max(options, key=lambda opt: opt[2])
|
|||
|
|
|||
|
if html:
|
|||
|
ruby_format = "<ruby><rb>{}</rb><rt>{}</rt></ruby>"
|
|||
|
else:
|
|||
|
ruby_format = "\ufff9{}\ufffa{}\ufffb"
|
|||
|
|
|||
|
def add_mismatches(mismatch_a, mismatch_b, final):
|
|||
|
# Need to pop out any extra junk characters at the beginning or end --
|
|||
|
# but only the kanji ones stay, since kanji is "canonical"
|
|||
|
while mismatch_a and isjunk[mismatch_a[0]]:
|
|||
|
final.append(mismatch_a.pop(0))
|
|||
|
while mismatch_b and isjunk[mismatch_b[0]]:
|
|||
|
mismatch_b.pop(0)
|
|||
|
endjunk = []
|
|||
|
while mismatch_a and isjunk[mismatch_a[-1]]:
|
|||
|
endjunk.append(mismatch_a.pop())
|
|||
|
while mismatch_b and isjunk[mismatch_b[-1]]:
|
|||
|
mismatch_b.pop()
|
|||
|
final.append(ruby_format.format(
|
|||
|
''.join(reversed(mismatch_a)),
|
|||
|
''.join(reversed(mismatch_b)),
|
|||
|
))
|
|||
|
final.extend(endjunk)
|
|||
|
del mismatch_a[:]
|
|||
|
del mismatch_b[:]
|
|||
|
|
|||
|
final = []
|
|||
|
mismatch_a = []
|
|||
|
mismatch_b = []
|
|||
|
a = len(kanji) - 1
|
|||
|
b = len(kana) - 1
|
|||
|
while True:
|
|||
|
walk_left, walk_up, equiv, score = table[a, b]
|
|||
|
if walk_left and walk_up:
|
|||
|
if equiv:
|
|||
|
if mismatch_a or mismatch_b:
|
|||
|
add_mismatches(mismatch_a, mismatch_b, final)
|
|||
|
final.append(kanji[a])
|
|||
|
else:
|
|||
|
mismatch_a.append(kanji[a])
|
|||
|
mismatch_b.append(kana[b])
|
|||
|
a -= 1
|
|||
|
b -= 1
|
|||
|
elif walk_left:
|
|||
|
mismatch_a.append(kanji[a])
|
|||
|
a -= 1
|
|||
|
elif walk_up:
|
|||
|
mismatch_b.append(kana[b])
|
|||
|
b -= 1
|
|||
|
else:
|
|||
|
break
|
|||
|
|
|||
|
if mismatch_a or mismatch_b:
|
|||
|
add_mismatches(mismatch_a, mismatch_b, final)
|
|||
|
|
|||
|
return ''.join(reversed(final))
|