Use YAML schema for gen 6/7; add gen7 form names; improved image support

Specifically:

- Add support for detecting FLIM format

- Add support for more color formats

- Add a small decoded image type that knows how to write itself out as
  a PNG

- Improve ETC1 decoder to work with images whose dimensions are not
  powers of two, images with no alpha channel, and images with the
  strange FLIM pixel order

- Port the gen 6/7 extractor to Construct 2.8

- Switch to using script tags in language names, to distinguish Japanese
  kana from kanji and Simplified from Traditional Chinese

- Drop the load-time merging of kanji and kana

- Add paths to various text files in SUMO

- Add form names for SUMO Pokémon

- Clean up identifiers a bit, especially the distinction between species
  and Pokémon

- Use the Pokémon schema type to dump what we have so far, and give it a
  couple more fields that didn't exist in gen 1

- Get movesets dumping correctly

- Special-case a bunch of weirdness, where the number of dex sprites
  doesn't match the number of models in SUMO
This commit is contained in:
Eevee (Lexy Munroe) 2017-01-05 04:57:05 -08:00
parent 0f79a5b922
commit 053f2a8d22
6 changed files with 846 additions and 375 deletions

View file

@ -1,3 +1,5 @@
import io
import itertools
import math import math
import struct import struct
@ -17,7 +19,7 @@ imag_header_struct = c.Struct(
'section_length' / c.Const(c.Int32ul, 0x10), 'section_length' / c.Const(c.Int32ul, 0x10),
'width' / c.Int16ul, 'width' / c.Int16ul,
'height' / c.Int16ul, 'height' / c.Int16ul,
'format' / c.Int32ul, #'format' / c.Int32ul,
# TODO this seems to have been expanded into several things in SUMO # TODO this seems to have been expanded into several things in SUMO
#c.Enum( #c.Enum(
# c.ULInt32('format'), # c.ULInt32('format'),
@ -37,13 +39,35 @@ imag_header_struct = c.Struct(
# A4=13, # A4=13,
# #ETC1=19, # #ETC1=19,
#) #)
'unknown' / c.Int16ul,
'format' / c.Enum(
c.Int8ul,
L8=0,
A8=1,
LA4=2,
LA8=3,
HILO8=4,
RGB565=5,
RGB8=6,
RGBA5551=7,
RGBA4=8,
RGBA8=9,
ETC1=10,
ETC1A4=11,
L4=12,
A4=13,
#ETC1=19,
),
# RGB565=5,
# ETC1A4=11,
'unknown2' / c.Int8ul,
) )
# TODO probably move these to their own module, since they aren't just for # TODO probably move these to their own module, since they aren't just for
# CLIM. pixel deshuffler, too. (which should probably spit out pypng's native # CLIM. pixel deshuffler, too. (which should probably spit out pypng's native
# format) # format)
COLOR_DECODERS = {} COLOR_FORMATS = {}
@attr.s @attr.s
@ -59,22 +83,68 @@ class ColorFormat:
def __iter__(self): def __iter__(self):
# TODO back compat until i fix the below code # TODO back compat until i fix the below code
return iter((self.decoder, self.bits_per_pixel, self.bit_depth)) return iter((self, self.bits_per_pixel, self.bit_depth))
def _register_color_decoder(name, *, bpp, depth, alpha): def _register_color_decoder(name, *, bpp, depth, alpha):
def register(f): def register(f):
COLOR_DECODERS[name] = ColorFormat(name, f, bpp, depth, alpha) COLOR_FORMATS[name] = ColorFormat(name, f, bpp, depth, alpha)
return f return f
return register return register
@_register_color_decoder('A4', bpp=0.5, depth=4, alpha=True)
def decode_A4(data):
    """Decode packed 4-bit alpha samples into RGBA tuples.

    Each input byte carries two samples.  The low nybble is emitted first,
    then the high nybble.  Every sample is widened to eight bits by
    repeating the nybble (0xN becomes 0xNN), and the color channels are
    always zero.
    """
    for packed in data:
        for nybble in (packed & 0xf, packed >> 4):
            # Copy the nybble into both halves of the output byte
            yield 0, 0, 0, (nybble << 4) | nybble
@_register_color_decoder('A8', bpp=1, depth=8, alpha=True)
def decode_a8(data):
    """Decode 8-bit alpha samples: one RGBA tuple per input byte.

    The color channels are always zero; the byte itself is the alpha value.
    """
    yield from ((0, 0, 0, alpha) for alpha in data)
@_register_color_decoder('L4', bpp=0.5, depth=4, alpha=False)
def decode_l4(data):
    """Decode packed 4-bit luminance samples into gray RGB tuples.

    Each input byte carries two samples.  The low nybble is emitted first,
    then the high nybble.  Every sample is widened to eight bits by
    repeating the nybble (0xN becomes 0xNN) and used for all three channels.
    """
    for packed in data:
        for nybble in (packed & 0xf, packed >> 4):
            level = (nybble << 4) | nybble
            yield level, level, level
@_register_color_decoder('L8', bpp=1, depth=8, alpha=False) @_register_color_decoder('L8', bpp=1, depth=8, alpha=False)
def decode_l8(data): def decode_l8(data):
for l in data: for l in data:
yield l, l, l yield l, l, l
@_register_color_decoder('LA4', bpp=1, depth=4, alpha=True)
def decode_la4(data):
    """Decode packed luminance/alpha bytes into RGBA tuples.

    Each input byte holds one pixel: the high nybble is the luminance and
    the low nybble is the alpha.  Both are widened to eight bits by
    repeating the nybble (0xN becomes 0xNN), so a fully opaque sample
    decodes to 0xFF rather than 0xF0.
    """
    for packed in data:
        luma = packed >> 4
        luma = (luma << 4) | luma
        alpha = packed & 0xf
        # Fill the low half too; shifting both copies by 4 would leave it zero
        alpha = (alpha << 4) | alpha
        yield luma, luma, luma, alpha
@_register_color_decoder('LA8', bpp=2, depth=8, alpha=True)
def decode_la8(data):
    """Decode two-byte luminance/alpha pixels into RGBA tuples.

    Each pixel is stored as the alpha byte followed by the luminance byte.
    The luminance is used for all three color channels.
    """
    for offset in range(0, len(data), 2):
        alpha, luma = data[offset], data[offset + 1]
        yield luma, luma, luma, alpha
@_register_color_decoder('RGBA4', bpp=2, depth=4, alpha=True) @_register_color_decoder('RGBA4', bpp=2, depth=4, alpha=True)
def decode_rgba4(data): def decode_rgba4(data):
# The idea is that every uint16 is a packed rrrrggggbbbbaaaa, but when # The idea is that every uint16 is a packed rrrrggggbbbbaaaa, but when
@ -93,15 +163,16 @@ def decode_rgba4(data):
@_register_color_decoder('RGB8', bpp=3, depth=8, alpha=False) @_register_color_decoder('RGB8', bpp=3, depth=8, alpha=False)
def decode_rgb8(data): def decode_rgb8(data):
for i in range(0, len(data), 3): for i in range(0, len(data), 3):
yield data[i:i + 3] yield data[i:i + 3][::-1]
@_register_color_decoder('RGBA8', bpp=4, depth=8, alpha=True) @_register_color_decoder('RGBA8', bpp=4, depth=8, alpha=True)
def decode_rgba8(data): def decode_rgba8(data):
for i in range(0, len(data), 4): for i in range(0, len(data), 4):
yield data[i:i + 4] yield data[i:i + 4][::-1]
# FIXME turns out the above just are these, so, ditch these
@_register_color_decoder('BGR8', bpp=3, depth=8, alpha=False) @_register_color_decoder('BGR8', bpp=3, depth=8, alpha=False)
def decode_bgr8(data): def decode_bgr8(data):
for i in range(0, len(data), 3): for i in range(0, len(data), 3):
@ -125,6 +196,7 @@ def decode_rgba5551(data, *, start=0, count=None):
for i in range(start, end, 2): for i in range(start, end, 2):
datum = data[i] + data[i + 1] * 256 datum = data[i] + data[i + 1] * 256
# FIXME repeat rather than doing division
r = (((datum >> 11) & 0x1f) * 255 + 15) // 31 r = (((datum >> 11) & 0x1f) * 255 + 15) // 31
g = (((datum >> 6) & 0x1f) * 255 + 15) // 31 g = (((datum >> 6) & 0x1f) * 255 + 15) // 31
b = (((datum >> 1) & 0x1f) * 255 + 15) // 31 b = (((datum >> 1) & 0x1f) * 255 + 15) // 31
@ -132,6 +204,45 @@ def decode_rgba5551(data, *, start=0, count=None):
yield r, g, b, a yield r, g, b, a
@_register_color_decoder('RGB565', bpp=2, depth=5, alpha=False)
def decode_rgb565(data, *, start=0, count=None):
    """Decode little-endian 16-bit RRRRRGGG GGGBBBBB pixels into RGB tuples.

    ``start`` is the byte offset of the first pixel.  ``count`` is the number
    of pixels to decode; when it is None, decoding runs to the end of
    ``data``.  Each channel is rescaled to 0-255 with rounding.
    """
    # FIXME i bet construct totally /can/ parse this mess for me
    stop = len(data) if count is None else start + count * 2

    for offset in range(start, stop, 2):
        word = data[offset] + (data[offset + 1] << 8)
        red = (word >> 11) & 0x1f
        green = (word >> 5) & 0x3f
        blue = word & 0x1f
        # FIXME repeat rather than doing division
        yield (
            (red * 255 + 15) // 31,
            (green * 255 + 31) // 63,
            (blue * 255 + 15) // 31,
        )
@_register_color_decoder('RGB332', bpp=1, depth=2, alpha=False)
def decode_rgb332(data, *, start=0, count=None):
    """Decode one-byte RRRGGGBB pixels into RGB tuples.

    ``start`` is the byte offset of the first pixel.  ``count`` is the number
    of pixels to decode; when it is None, decoding runs to the end of
    ``data``.  Each channel is widened to eight bits by repeating its bit
    pattern, so an all-ones field decodes to 255.
    """
    stop = len(data) if count is None else start + count

    for offset in range(start, stop):
        packed = data[offset]

        red = (packed >> 5) & 0x7
        red = (red << 5) | (red << 2) | (red >> 1)

        green = (packed >> 2) & 0x7
        green = (green << 5) | (green << 2) | (green >> 1)

        # Blue is only two bits wide; a three-bit mask would also pick up
        # the lowest green bit.
        blue = packed & 0x3
        blue = (blue << 6) | (blue << 4) | (blue << 2) | blue

        yield red, green, blue
_register_color_decoder('ETC1', bpp=0.5, depth=4, alpha=False)(None)
_register_color_decoder('ETC1A4', bpp=1, depth=4, alpha=True)(None)
del _register_color_decoder del _register_color_decoder
@ -156,12 +267,24 @@ def untile_pixels(raw_pixels, width, height, *, is_flim):
Taken from: https://github.com/Zhorken/pokemon-x-y-icons/ Taken from: https://github.com/Zhorken/pokemon-x-y-icons/
""" """
# FIXME this is a wild guess, because i've seen a 4x4 image that this just
# doesn't handle correctly, but the image is all white so i have no idea
# what the right fix is -- there's a 4 x 0x78 in 0/7/9 though...
if width < 8 or height < 8:
pixels = []
it = iter(raw_pixels)
for r in range(height):
pixels.append([])
for c in range(width):
pixels[-1].append(next(it))
return pixels
# Images are stored padded to powers of two # Images are stored padded to powers of two
stored_width = 2 ** math.ceil(math.log(width) / math.log(2)) stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
stored_height = 2 ** math.ceil(math.log(height) / math.log(2)) stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
num_pixels = stored_width * stored_height num_pixels = stored_width * stored_height
tile_width = stored_width // 8 tile_width = (stored_width + 7) // 8
tile_height = stored_height // 8 tile_height = (stored_height + 7) // 8
pixels = [ pixels = [
[None for x in range(width)] [None for x in range(width)]
@ -175,6 +298,7 @@ def untile_pixels(raw_pixels, width, height, *, is_flim):
# Find the coordinates of the top-left corner of the current tile. # Find the coordinates of the top-left corner of the current tile.
# n.b. The image is eight tiles wide, and each tile is 8×8 pixels. # n.b. The image is eight tiles wide, and each tile is 8×8 pixels.
tile_num = n // 64 tile_num = n // 64
# FIXME i found a 4x4 FLIM that this fails for???
if is_flim: if is_flim:
# The FLIM format seems to pseudo-rotate the entire image to the # The FLIM format seems to pseudo-rotate the entire image to the
# right, so tiles start in the bottom left and go up # right, so tiles start in the bottom left and go up
@ -224,25 +348,38 @@ def decode_clim(data):
raise ValueError("Unknown image format {}".format(file_format)) raise ValueError("Unknown image format {}".format(file_format))
imag_header = imag_header_struct.parse(data[-20:]) imag_header = imag_header_struct.parse(data[-20:])
if is_flim: #if is_flim:
# TODO SUMO hack; not sure how to get format out of this header # # TODO SUMO hack; not sure how to get format out of this header
imag_header.format = 'RGBA5551' # imag_header.format = 'RGBA5551'
if imag_header.format not in COLOR_DECODERS: if imag_header.format not in COLOR_FORMATS:
raise ValueError( raise ValueError(
"don't know how to decode {} pixels".format(imag_header.format)) "don't know how to decode {} pixels".format(imag_header.format))
color_decoder, color_bpp, color_depth = COLOR_DECODERS[imag_header.format] color_format = COLOR_FORMATS[imag_header.format]
mode, = struct.unpack_from('<H', data, 0) mode, = struct.unpack_from('<H', data, 0)
if mode == 2: if mode == 2:
# Paletted # Paletted
palette_length, = struct.unpack_from('<H', data, 2) palette_length, = struct.unpack_from('<H', data, 2)
palette = list(color_decoder(data, start=4, count=palette_length)) palette = list(color_format.decoder(data, start=4, count=palette_length))
data_start = 4 + palette_length * color_bpp data_start = 4 + palette_length * color_format.bits_per_pixel
scrambled_pixels = uncuddle_paletted_pixels(palette, data[data_start:]) scrambled_pixels = uncuddle_paletted_pixels(palette, data[data_start:])
elif imag_header.format == 'ETC1':
# FIXME merge this decoder in (problem is it needs to know width +
# height -- maybe i can move the pixel unscrambling out of it somehow?)
from .etc1 import decode_etc1
pixels = decode_etc1(b'\x00' * 0x80 + data, imag_header.width, imag_header.height, use_alpha=False, is_flim=True)[4]
return DecodedImageData(
imag_header.width, imag_header.height, color_format, None, pixels)
elif imag_header.format == 'ETC1A4':
# FIXME same
from .etc1 import decode_etc1
pixels = decode_etc1(b'\x00' * 0x80 + data, imag_header.width, imag_header.height, is_flim=True)[4]
return DecodedImageData(
imag_header.width, imag_header.height, color_format, None, pixels)
else: else:
palette = None palette = None
scrambled_pixels = color_decoder(data) scrambled_pixels = color_format.decoder(data)
pixels = untile_pixels( pixels = untile_pixels(
scrambled_pixels, scrambled_pixels,
@ -250,4 +387,55 @@ def decode_clim(data):
imag_header.height, imag_header.height,
is_flim=is_flim, is_flim=is_flim,
) )
return imag_header.width, imag_header.height, color_depth, palette, pixels return DecodedImageData(
imag_header.width, imag_header.height, color_format, palette, pixels)
class DecodedImageData:
    """A decoded image that knows how to write itself out as a PNG.

    For backwards compatibility the object can also be used like the tuple
    ``(width, height, bit_depth, palette, pixels)``: it can be unpacked by
    iteration and indexed by position (for example ``image[4]`` for the
    pixels).
    """

    def __init__(self, width, height, color_format, palette, pixels):
        self.width = width
        self.height = height
        # Must expose ``bit_depth`` and ``alpha`` attributes
        self.color_format = color_format
        # Palette entries, or None for a direct-color image
        self.palette = palette
        # List of rows; ``mirror`` reverses each row in place
        self.pixels = pixels

    def _as_tuple(self):
        """Return the legacy 5-tuple form of this image."""
        return (
            self.width,
            self.height,
            self.color_format.bit_depth,
            self.palette,
            self.pixels,
        )

    def __iter__(self):
        return iter(self._as_tuple())

    def __getitem__(self, index):
        # Callers still subscript decoder results (e.g. ``[4]`` for pixels),
        # which needs more than ``__iter__``.
        return self._as_tuple()[index]

    def mirror(self):
        """Flip the image horizontally, in place."""
        for row in self.pixels:
            row.reverse()

    def write_to_png(self, f):
        """Write the results of ``decode_clim`` to a file object."""
        import png

        writer_kwargs = dict(width=self.width, height=self.height)
        if self.palette:
            writer_kwargs['palette'] = self.palette
        elif self.color_format.alpha:
            # TODO do i really only need alpha=True if there's no palette?
            writer_kwargs['alpha'] = True
        writer = png.Writer(**writer_kwargs)

        # For a paletted image, I want to preserve Zhorken's good idea of
        # indicating the original bit depth with an sBIT chunk.  But PyPNG
        # can't do that directly, so instead I have to do some nonsense.
        # FIXME should probably just do that for everything?
        if self.palette:
            buf = io.BytesIO()
            writer.write(buf, self.pixels)

            # Read the PNG as chunks, and manually add an sBIT chunk
            buf.seek(0)
            png_reader = png.Reader(buf)
            chunks = list(png_reader.chunks())
            sbit = bytes([self.color_format.bit_depth] * 3)
            chunks.insert(1, ('sBIT', sbit))

            # Now write the chunks to the file
            png.write_chunks(f, chunks)
        else:
            # Otherwise, it's... almost straightforward: PyPNG wants each
            # row as one flat sequence of channel values.
            writer.write(
                f, (itertools.chain.from_iterable(row) for row in self.pixels))

View file

@ -8,6 +8,7 @@ that decodes four 4x4 blocks one 8x8 block at a time, because of course it is.
(I believe the 3DS operates with 8x8 tiles, so this does make some sense.) (I believe the 3DS operates with 8x8 tiles, so this does make some sense.)
""" """
import io import io
import math
# Easier than doing math # Easier than doing math
THREE_BIT_TWOS_COMPLEMENT = [0, 1, 2, 3, -4, -3, -2, -1] THREE_BIT_TWOS_COMPLEMENT = [0, 1, 2, 3, -4, -3, -2, -1]
@ -40,28 +41,34 @@ def clamp_to_byte(n):
return max(0, min(255, n)) return max(0, min(255, n))
def decode_etc1(data): # FIXME sizes are hardcoded here
# TODO sizes are hardcoded here def decode_etc1(data, width=128, height=128, use_alpha=True, is_flim=False):
width = 128
height = 128
# TODO this seems a little redundant; could just ask for a stream # TODO this seems a little redundant; could just ask for a stream
f = io.BytesIO(data) f = io.BytesIO(data)
# Skip header # Skip header
f.read(0x80) f.read(0x80)
outpixels = [[None] * width for _ in range(height)] # Images are stored padded to powers of two
stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
outpixels = [[None] * (width) for _ in range(height)]
# ETC1 encodes as 4x4 blocks. Normal ETC1 arranges them in English reading # ETC1 encodes as 4x4 blocks. Normal ETC1 arranges them in English reading
# order, right and down. This Nintendo variant groups them as 8x8 # order, right and down. This Nintendo variant groups them as 8x8
# superblocks, where the four blocks in each superblock are themselves # superblocks, where the four blocks in each superblock are themselves
# arranged right and down. So we read block offsets 8 at a time, and 'z' # arranged right and down. So we read block offsets 8 at a time, and 'z'
# is our current position within a superblock. # is our current position within a superblock.
# TODO this may do the wrong thing if width/height is not divisible by 8 # TODO this may do the wrong thing if width/height is not divisible by 8
for blocky in range(0, height, 8): for blocky in range(0, stored_height, 8):
for blockx in range(0, width, 8): for blockx in range(0, stored_width, 8):
for z in range(4): for z in range(4):
row = f.read(16) if use_alpha:
if not row: row = f.read(16)
else:
# FIXME this could sure be incorporated better
row = b'\xff' * 8 + f.read(8)
if len(row) < 16:
print(row, blocky, blockx, z, f.tell() - 0x80, len(data) - 0x80)
raise EOFError raise EOFError
# Each block is encoded as 16 bytes. The first 8 are a 4-bit # Each block is encoded as 16 bytes. The first 8 are a 4-bit
@ -126,16 +133,28 @@ def decode_etc1(data):
base1 = red1, green1, blue1 base1 = red1, green1, blue1
base2 = red2, green2, blue2 base2 = red2, green2, blue2
# FLIM images do this truly bizarre thing where they write out the columns, as rows
if is_flim:
block = (blocky // 8) * (stored_width // 8) + (blockx // 8)
x0 = block // (stored_height // 8) * 8 + z // 2 * 4
y0 = block % (stored_height // 8) * 8 + z % 2 * 4
else:
x0 = blockx + z % 2 * 4
y0 = blocky + z // 2 * 4
# Now deal with individual pixels # Now deal with individual pixels
it = iter_alpha_nybbles(alpha) it = iter_alpha_nybbles(alpha)
for c in range(4): for c in range(4):
for r in range(4): for r in range(4):
x = blockx + c if is_flim:
y = blocky + r x = x0 + r
if z in (1, 3): y = y0 + c
x += 4 else:
if z in (2, 3): x = x0 + c
y += 4 y = y0 + r
if not (x < width and y < height):
continue
if (flipbit and r < 2) or (not flipbit and c < 2): if (flipbit and r < 2) or (not flipbit and c < 2):
table = table1 table = table1
@ -149,8 +168,11 @@ def decode_etc1(data):
lobit = (lopixelbits >> pixelbit) & 0x1 lobit = (lopixelbits >> pixelbit) & 0x1
mod = table[hibit * 2 + lobit] mod = table[hibit * 2 + lobit]
color = tuple(clamp_to_byte(b + mod) for b in base) color = tuple(clamp_to_byte(b + mod) for b in base)
color += (next(it),) if use_alpha:
color += (next(it),)
outpixels[y][x] = color outpixels[y][x] = color
# 4 is the bit depth; None is the palette # 4 is the bit depth; None is the palette
return width, height, 4, None, outpixels from .clim import DecodedImageData, COLOR_FORMATS
# FIXME stupid import, wrong color format
return DecodedImageData(width, height, COLOR_FORMATS['ETC1A4'], None, outpixels)

View file

@ -94,13 +94,16 @@ class GARCEntry(object):
def __getitem__(self, i): def __getitem__(self, i):
start, length = self.slices[i] start, length = self.slices[i]
ss = self.stream.slice(start, length) ss = self.stream.slice(start, length)
if ss.peek(1) in [b'\x10', b'\x11']: if ss.peek(1) in b'\x10\x11':
# XXX this sucks but there's no real way to know for sure whether # XXX this sucks but there's no real way to know for sure whether
# data is compressed or not. maybe just bake this into the caller # data is compressed or not. maybe just bake this into the caller
# and let them deal with it, same way we do with text decoding? # and let them deal with it, same way we do with text decoding?
# TODO it would be nice if this could be done lazily for 'inspect' # TODO it would be nice if this could be done lazily for 'inspect'
# purposes, since the first four bytes are enough to tell you the # purposes, since the first four bytes are enough to tell you the
# size # size
# FIXME make this work even for red herrings, maybe by finishing it
# up and doing a trial decompression of the first x bytes
#return CompressedStream(ss)
try: try:
data = lzss3.decompress_bytes(ss.read()) data = lzss3.decompress_bytes(ss.read())
except Exception: except Exception:
@ -113,6 +116,47 @@ class GARCEntry(object):
return len(self.slices) return len(self.slices)
class CompressedStream:
    """File-like wrapper that decompresses an LZSS stream on first access.

    Only the four-byte header is read up front: a marker byte (0x10 or 0x11)
    followed by a 24-bit little-endian length, which ``len()`` reports.  The
    payload is decompressed the first time ``read``, ``seek``, ``tell`` or
    ``peek`` is called.
    """

    def __init__(self, stream):
        self.stream = stream
        header = stream.read(4)
        # Rewind so the decompressor later sees the header too
        stream.seek(0)
        assert header[0] in b'\x10\x11'
        # Pad the three length bytes out to a full uint32
        (self.length,) = struct.unpack('<L', header[1:] + b'\x00')
        self.data = None

    def __len__(self):
        return self.length

    def _decompress(self):
        self.data = BytesIO(lzss3.decompress_bytes(self.stream.read()))

    def _buffer(self):
        """Return the decompressed buffer, creating it on first use."""
        if self.data is None:
            self._decompress()
        return self.data

    def read(self, *args):
        return self._buffer().read(*args)

    def seek(self, *args):
        return self._buffer().seek(*args)

    def tell(self, *args):
        return self._buffer().tell(*args)

    def peek(self, n):
        """Return up to ``n`` bytes without moving the read position."""
        buf = self._buffer()
        mark = buf.tell()
        chunk = buf.read(n)
        buf.seek(mark)
        return chunk
XY_CHAR_MAP = { XY_CHAR_MAP = {
0x307f: 0x202f, # nbsp 0x307f: 0x202f, # nbsp
0xe08d: 0x2026, # ellipsis 0xe08d: 0x2026, # ellipsis
@ -360,7 +404,7 @@ def do_inspect(args):
else: else:
print() print()
cutoff = max(total_subfiles // 10, 2) cutoff = max(total_subfiles // 10, 1)
for magic, ct in magic_ctr.most_common(): for magic, ct in magic_ctr.most_common():
if ct < cutoff: if ct < cutoff:
break break

View file

@ -11,7 +11,7 @@ class PokemonContainerFile(_ContainerFile):
self.stream = stream = Substream(stream) self.stream = stream = Substream(stream)
magic, entry_ct = stream.unpack('<2sH') magic, entry_ct = stream.unpack('<2sH')
assert magic == b'PC' assert magic in (b'PC', b'PS', b'BL')
# Offsets are "A B C ...", where entry 0 ranges from A to B, entry 1 # Offsets are "A B C ...", where entry 0 ranges from A to B, entry 1
# from B to C, etc. # from B to C, etc.

File diff suppressed because it is too large Load diff

View file

@ -164,6 +164,8 @@ Evolution = _ForwardDeclaration()
EncounterMap = _ForwardDeclaration() EncounterMap = _ForwardDeclaration()
MoveSet = _ForwardDeclaration() MoveSet = _ForwardDeclaration()
Pokedex = _ForwardDeclaration() Pokedex = _ForwardDeclaration()
Item = _ForwardDeclaration()
Ability = _ForwardDeclaration()
class Pokémon(VersionedLocus): class Pokémon(VersionedLocus):
@ -173,6 +175,9 @@ class Pokémon(VersionedLocus):
base_stats = _Map(Stat, int) base_stats = _Map(Stat, int)
growth_rate = _Value(GrowthRate) growth_rate = _Value(GrowthRate)
base_experience = _Value(int, min=0, max=255) base_experience = _Value(int, min=0, max=255)
capture_rate = _Value(int, min=0, max=255)
held_items = _Map(Item, int)
gender_rate = _Value(int)
pokedex_numbers = _Map(Pokedex, int) pokedex_numbers = _Map(Pokedex, int)
@ -202,9 +207,25 @@ class Pokémon(VersionedLocus):
# TODO should this be written in hex, maybe? # TODO should this be written in hex, maybe?
game_index = _Value(int) game_index = _Value(int)
# FIXME how do i distinguish hidden ability?
abilities = _List(Ability)
Pokemon = Pokémon Pokemon = Pokémon
MoveEffect = _ForwardDeclaration()
class Move(VersionedLocus):
    # Schema record describing a move.  Each attribute below declares one
    # field using the schema helpers defined elsewhere in this module.
    # Localized text field
    name = _Localized(str)
    type = _Value(Type)
    # NOTE(review): no min/max bounds are declared on these integer fields,
    # unlike some of the Pokémon fields -- confirm whether that is intended.
    power = _Value(int)
    pp = _Value(int)
    accuracy = _Value(int)
    # MoveEffect is only a forward declaration at this point in the module
    effect = _Value(MoveEffect)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# The repository class, primary interface to the data # The repository class, primary interface to the data