Initial gen6-to-yaml ripping stuff

Eevee (Lexy Munroe) 2016-02-26 10:05:51 -08:00
parent 54ea67a804
commit 949eafb957
9 changed files with 1841 additions and 0 deletions

pokedex/extract/lib/base.py Normal file

@@ -0,0 +1,87 @@
"""Base or helper classes used a lot for dealing with file formats.
"""
import io
import struct
class Substream:
"""Wraps a stream and pretends it starts at an offset other than 0.
Partly implements the file interface.
This type always seeks before reading, but doesn't do so afterwards, so
interleaving reads with the underlying stream may not do what you want.
"""
def __init__(self, stream, offset=0, length=-1):
if isinstance(stream, Substream):
self.stream = stream.stream
self.offset = offset + stream.offset
else:
self.stream = stream
self.offset = offset
self.length = length
self.pos = 0
def __repr__(self):
return "<{} of {} at {}>".format(
type(self).__name__, self.stream, self.offset)
    def read(self, n=-1):
        self.stream.seek(self.offset + self.pos)
        if self.length >= 0:
            # Clamp to what's left of this substream, not its total length
            remaining = self.length - self.pos
            if n < 0 or n > remaining:
                n = remaining
        data = self.stream.read(n)
        self.pos += len(data)
        return data
def seek(self, offset):
offset = max(offset, 0)
if self.length >= 0:
offset = min(offset, self.length)
self.stream.seek(self.offset + offset)
self.pos = self.tell()
def tell(self):
return self.stream.tell() - self.offset
def __len__(self):
if self.length < 0:
pos = self.stream.tell()
self.stream.seek(0, io.SEEK_END)
parent_length = self.stream.tell()
self.stream.seek(pos)
return parent_length - self.offset
else:
return self.length
def peek(self, n):
pos = self.stream.tell()
self.stream.seek(self.offset + self.pos)
data = self.stream.read(n)
self.stream.seek(pos)
return data
def unpack(self, fmt):
"""Unpacks a struct format from the current position in the stream."""
data = self.read(struct.calcsize(fmt))
return struct.unpack(fmt, data)
    def slice(self, offset, length=-1):
        # TODO limit or warn if length is too long for this slice?
        # The constructor already folds in a parent Substream's offset, so
        # pass the offset relative to this substream
        return Substream(self, offset, length)
class _ContainerFile:
slices = ()
def __len__(self):
return len(self.slices)
def __iter__(self):
return iter(self.slices)
def __getitem__(self, key):
return self.slices[key]
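# Editor's note: a minimal usage sketch (not part of the original commit)
# showing how Substream, slice(), and unpack() compose.  Offsets are relative
# to the substream, and nested slices fold in their parents' offsets:
#
#     >>> import io
#     >>> ss = Substream(io.BytesIO(b'abcdefgh'), offset=2, length=4)
#     >>> ss.read(2)
#     b'cd'
#     >>> ss.slice(2, 2).read()
#     b'ef'
#     >>> Substream(io.BytesIO(b'\x01\x00\x02\x00')).unpack('<HH')
#     (1, 2)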

pokedex/extract/lib/clim.py Normal file

@@ -0,0 +1,182 @@
import math
import struct
import construct as c
clim_header_struct = c.Struct(
'clim_header',
c.Magic(b'CLIM'),
c.Const(c.ULInt16('endianness'), 0xfeff),
c.Const(c.ULInt16('header_length'), 0x14),
c.ULInt32('version'),
c.ULInt32('file_size'),
c.ULInt32('blocks_ct'),
)
imag_header_struct = c.Struct(
'imag_header',
c.Magic(b'imag'),
c.Const(c.ULInt32('section_length'), 0x10),
c.ULInt16('width'),
c.ULInt16('height'),
c.Enum(
c.ULInt32('format'),
L8=0,
A8=1,
LA4=2,
LA8=3,
HILO8=4,
RGB565=5,
RGB8=6,
RGBA5551=7,
RGBA4=8,
RGBA8=9,
ETC1=10,
ETC1A4=11,
L4=12,
A4=13,
#ETC1=19,
)
)
COLOR_DECODERS = {}
def _register_color_decoder(name, *, bpp, depth):
def register(f):
COLOR_DECODERS[name] = f, bpp, depth
return f
return register
@_register_color_decoder('RGBA4', bpp=2, depth=4)
def decode_rgba4(data, *, start=0, count=None):
    # The idea is that every uint16 is a packed rrrrggggbbbbaaaa, but when
    # written out little-endian this becomes bbbbaaaarrrrgggg and there's just
    # no pretty way to deal with this
    # Take start/count like decode_rgba5551, so paletted images work too
    if count is None:
        end = len(data)
    else:
        end = start + count * 2
    for i in range(start, end, 2):
ba = data[i]
rg = data[i + 1]
r = (((rg & 0xf0) >> 4) * 255 + 7) // 15
g = (((rg & 0x0f) >> 0) * 255 + 7) // 15
b = (((ba & 0xf0) >> 4) * 255 + 7) // 15
a = (((ba & 0x0f) >> 0) * 255 + 7) // 15
yield r, g, b, a
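# Editor's note, a worked example of the nibble expansion above (not in the
# original commit): each 4-bit channel v scales to 8 bits as
# (v * 255 + 7) // 15, so 0xf -> 255 and 0x3 -> 51.  For the byte pair
# 0x3f, 0xf0 (i.e. ba == 0x3f, rg == 0xf0):
#
#     >>> list(decode_rgba4(b'\x3f\xf0'))
#     [(255, 0, 51, 255)]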
@_register_color_decoder('RGBA5551', bpp=2, depth=5)
def decode_rgba5551(data, *, start=0, count=None):
# I am extremely irritated that construct cannot parse this mess for me
# rrrrrgggggbbbbba
if count is None:
end = len(data)
else:
end = start + count * 2
for i in range(start, end, 2):
datum = data[i] + data[i + 1] * 256
r = (((datum >> 11) & 0x1f) * 255 + 15) // 31
g = (((datum >> 6) & 0x1f) * 255 + 15) // 31
b = (((datum >> 1) & 0x1f) * 255 + 15) // 31
a = (datum & 0x1) * 255
yield r, g, b, a
del _register_color_decoder
def apply_palette(palette, data, *, start=0):
# TODO i am annoyed that this does a pointless copy, but i assume islice()
# has even more overhead...
if start != 0:
data = data[start:]
if len(palette) <= 16:
# Short palettes allow cramming two pixels into each byte
return (
palette[idx]
for byte in data
for idx in (byte >> 4, byte & 0x0f)
)
else:
return map(palette.__getitem__, data)
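# Editor's note: a small illustration (not in the original commit) of the
# packed 4-bit case -- with a short palette, one byte holds two pixel
# indices, high nibble first:
#
#     >>> palette = [(0, 0, 0, 255), (255, 255, 255, 255)]
#     >>> list(apply_palette(palette, b'\x01'))
#     [(0, 0, 0, 255), (255, 255, 255, 255)]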
def untile_pixels(raw_pixels, width, height):
"""Unscramble pixels into plain old rows.
The pixels are arranged in 8×8 tiles, and each tile is a third-
iteration Z-order curve.
Taken from: https://github.com/Zhorken/pokemon-x-y-icons/
"""
# Images are stored padded to powers of two
stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
num_pixels = stored_width * stored_height
tile_width = stored_width // 8
pixels = [
[None for x in range(width)]
for y in range(height)
]
for n, pixel in enumerate(raw_pixels):
if n >= num_pixels:
break
# Find the coordinates of the top-left corner of the current tile.
        # n.b. The image is `tile_width` tiles wide, and each tile is 8×8 pixels.
tile_num = n // 64
tile_y = tile_num // tile_width * 8
tile_x = tile_num % tile_width * 8
# Determine the pixel's coordinates within the tile
# http://en.wikipedia.org/wiki/Z-order_curve#Coordinate_values
within_tile = n % 64
sub_x = (
(within_tile & 0b000001) |
(within_tile & 0b000100) >> 1 |
(within_tile & 0b010000) >> 2
)
sub_y = (
(within_tile & 0b000010) >> 1 |
(within_tile & 0b001000) >> 2 |
(within_tile & 0b100000) >> 3
)
# Add up the pixel's coordinates within the whole image
x = tile_x + sub_x
y = tile_y + sub_y
if x < width and y < height:
pixels[y][x] = pixel
return pixels
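# Editor's note, a worked example of the bit deinterleave above: within a
# tile, pixel 45 == 0b101101; its even bits give sub_x == 0b011 == 3 and its
# odd bits give sub_y == 0b110 == 6, so it lands at (3, 6) of the 8×8 tile.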
def decode_clim(data):
imag_header = imag_header_struct.parse(data[-20:])
if imag_header.format not in COLOR_DECODERS:
raise ValueError(
"don't know how to decode {} pixels".format(imag_header.format))
color_decoder, color_bpp, color_depth = COLOR_DECODERS[imag_header.format]
mode, = struct.unpack_from('<H', data, 0)
if mode == 2:
# Paletted
palette_length, = struct.unpack_from('<H', data, 2)
palette = list(color_decoder(data, start=4, count=palette_length))
data_start = 4 + palette_length * color_bpp
scrambled_pixels = apply_palette(palette, data[data_start:])
else:
scrambled_pixels = color_decoder(data)
pixels = untile_pixels(
scrambled_pixels,
imag_header.width,
imag_header.height,
)
return imag_header.width, imag_header.height, color_depth, pixels

pokedex/extract/lib/garc.py Normal file

@@ -0,0 +1,307 @@
"""Support for reading the GARC generic container format used in the 3DS
filesystem.
Based on code by Zhorken: https://github.com/Zhorken/pokemon-x-y-icons
and Kaphotics: https://github.com/kwsch/GARCTool
"""
from io import BytesIO
from pathlib import Path
import struct
import sys
import construct as c
from . import lzss3
from .base import _ContainerFile, Substream
from .pc import PokemonContainerFile
def count_bits(n):
c = 0
while n:
c += n & 1
n >>= 1
return c
garc_header_struct = c.Struct(
'garc_header',
c.Magic(b'CRAG'),
c.Const(c.ULInt32('header_size'), 0x1c),
c.Const(c.ULInt16('byte_order'), 0xfeff),
c.Const(c.ULInt16('mystery1'), 0x0400),
c.Const(c.ULInt32('chunks_ct'), 4),
c.ULInt32('data_offset'),
c.ULInt32('garc_length'),
c.ULInt32('last_length'),
)
fato_header_struct = c.Struct(
'fato_header',
c.Magic(b'OTAF'),
c.ULInt32('header_size'),
c.ULInt16('count'),
c.Const(c.ULInt16('padding'), 0xffff),
c.Array(
lambda ctx: ctx.count,
c.ULInt32('fatb_offsets'),
),
)
fatb_header_struct = c.Struct(
'fatb_header',
c.Magic(b'BTAF'),
c.ULInt32('fatb_length'),
c.ULInt32('count'),
)
class GARCFile(_ContainerFile):
def __init__(self, stream):
self.stream = stream = Substream(stream)
garc_header = garc_header_struct.parse_stream(self.stream)
# FATO (file allocation table... offsets?)
fato_header = fato_header_struct.parse_stream(self.stream)
# FATB (file allocation table)
fatb_header = fatb_header_struct.parse_stream(self.stream)
fatb_start = garc_header.header_size + fato_header.header_size
assert stream.tell() == fatb_start + 12
self.slices = []
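        # Editor's note: each FATO offset points at a FATB record -- a uint32
        # bit vector of which sub-slots exist, followed by one (start, end,
        # length) triple per set bit, with starts relative to data_offset.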
for i, offset in enumerate(fato_header.fatb_offsets):
stream.seek(fatb_start + offset + 12)
slices = []
bits, = struct.unpack('<L', stream.read(4))
while bits:
if bits & 1:
start, end, length = struct.unpack('<3L', stream.read(12))
assert end - 4 < start + length <= end
slices.append((garc_header.data_offset + start, length))
bits >>= 1
self.slices.append(GARCEntry(stream, slices))
# FIMB
stream.seek(fatb_start + fatb_header.fatb_length)
magic, fimb_header_length, fimb_length = struct.unpack(
'<4s2L', stream.read(12))
assert magic == b'BMIF'
assert fimb_header_length == 0xC
class GARCEntry(object):
def __init__(self, stream, slices):
self.stream = stream
self.slices = slices
def __getitem__(self, i):
start, length = self.slices[i]
ss = self.stream.slice(start, length)
if ss.peek(1) in [b'\x10', b'\x11']:
# XXX this sucks but there's no real way to know for sure whether
# data is compressed or not. maybe just bake this into the caller
# and let them deal with it, same way we do with text decoding?
# TODO it would be nice if this could be done lazily for 'inspect'
# purposes, since the first four bytes are enough to tell you the
# size
try:
data = lzss3.decompress_bytes(ss.read())
except Exception:
ss.seek(0)
else:
return Substream(BytesIO(data))
return ss
def __len__(self):
return len(self.slices)
XY_CHAR_MAP = {
0x307f: 0x202f, # nbsp
0xe08d: 0x2026, # ellipsis
0xe08e: 0x2642, # female sign
0xe08f: 0x2640, # male sign
}
XY_VAR_NAMES = {
0xff00: "COLOR",
0x0100: "TRNAME",
0x0101: "PKNAME",
0x0102: "PKNICK",
0x0103: "TYPE",
0x0105: "LOCATION",
0x0106: "ABILITY",
0x0107: "MOVE",
0x0108: "ITEM1",
0x0109: "ITEM2",
0x010a: "sTRBAG",
0x010b: "BOX",
0x010d: "EVSTAT",
0x0110: "OPOWER",
0x0127: "RIBBON",
0x0134: "MIINAME",
0x013e: "WEATHER",
0x0189: "TRNICK",
0x018a: "1stchrTR",
0x018b: "SHOUTOUT",
0x018e: "BERRY",
0x018f: "REMFEEL",
0x0190: "REMQUAL",
0x0191: "WEBSITE",
0x019c: "CHOICECOS",
0x01a1: "GSYNCID",
0x0192: "PRVIDSAY",
0x0193: "BTLTEST",
0x0195: "GENLOC",
0x0199: "CHOICEFOOD",
0x019a: "HOTELITEM",
0x019b: "TAXISTOP",
0x019f: "MAISTITLE",
0x1000: "ITEMPLUR0",
0x1001: "ITEMPLUR1",
0x1100: "GENDBR",
0x1101: "NUMBRNCH",
0x1302: "iCOLOR2",
0x1303: "iCOLOR3",
0x0200: "NUM1",
0x0201: "NUM2",
0x0202: "NUM3",
0x0203: "NUM4",
0x0204: "NUM5",
0x0205: "NUM6",
0x0206: "NUM7",
0x0207: "NUM8",
0x0208: "NUM9",
}
def _xy_inner_keygen(key):
while True:
yield key
key = ((key << 3) | (key >> 13)) & 0xffff
def _xy_outer_keygen():
key = 0x7c89
while True:
yield _xy_inner_keygen(key)
key = (key + 0x2983) & 0xffff
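# Editor's note: the inner keygen rotates the key left by 3 within 16 bits
# each step, and each line's starting key advances by 0x2983.  For example
# (illustration, not part of the original commit):
#
#     >>> kg = _xy_inner_keygen(0x7c89)
#     >>> hex(next(kg)), hex(next(kg))
#     ('0x7c89', '0xe44b')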
def decrypt_xy_text(data):
text_sections, lines, length, initial_key, section_data = struct.unpack_from(
'<HHLLl', data)
outer_keygen = _xy_outer_keygen()
ret = []
for i in range(lines):
keygen = next(outer_keygen)
s = []
offset, length = struct.unpack_from('<lh', data, i * 8 + section_data + 4)
offset += section_data
start = offset
characters = []
for ech in struct.unpack_from("<{}H".format(length), data, offset):
characters.append(ech ^ next(keygen))
chiter = iter(characters)
for c in chiter:
if c == 0:
break
elif c == 0x10:
# Goofy variable thing
length = next(chiter)
typ = next(chiter)
if typ == 0xbe00:
# Pause, then scroll
s.append('\r')
elif typ == 0xbe01:
# Pause, then clear screen
s.append('\f')
elif typ == 0xbe02:
# Pause for some amount of time?
s.append("{{pause:{}}}".format(next(chiter)))
elif typ == 0xbdff:
# Empty text line? Includes line number, maybe for finding unused lines?
s.append("{{blank:{}}}".format(next(chiter)))
else:
s.append("{{{}:{}}}".format(
XY_VAR_NAMES.get(typ, "{:04x}".format(typ)),
','.join(str(next(chiter)) for _ in range(length - 1)),
))
else:
s.append(chr(XY_CHAR_MAP.get(c, c)))
ret.append(''.join(s))
return ret
def main(args):
parser = make_arg_parser()
args = parser.parse_args(args)
args.cb(args)
def do_inspect(args):
with open(args.path, 'rb') as f:
garc = GARCFile(f)
for i, topfile in enumerate(garc):
print("File #{}, {} entr{}".format(
i, len(topfile), 'y' if len(topfile) == 1 else 'ies'))
for j, subfile in enumerate(topfile):
print(' ', j, len(subfile), end='')
if subfile.peek(2) == b'PC':
print(" -- appears to be a PC file (generic container)")
pcfile = PokemonContainerFile(subfile)
for k, entry in enumerate(pcfile):
print(' ', repr(entry.read(50)))
else:
print('', repr(subfile.read(50)))
def do_extract(args):
with open(args.path, 'rb') as f:
garc = GARCFile(f)
# TODO shouldn't path really be a directory, so you can mass-extract everything? do i want to do that ever?
# TODO actually respect mode, fileno, entryno
for i, topfile in enumerate(garc):
# TODO i guess this should be a list, or??
if args.fileno is not all and args.fileno != i:
continue
for j, subfile in enumerate(topfile):
# TODO auto-detect extension, maybe? depending on mode?
outfile = Path("{}-{}-{}".format(args.out, i, j))
with outfile.open('wb') as g:
# TODO should use copyfileobj
g.write(subfile.read())
print("wrote", outfile)
def make_arg_parser():
from argparse import ArgumentParser
p = ArgumentParser()
sp = p.add_subparsers(metavar='command')
inspect_p = sp.add_parser('inspect', help='examine a particular file')
inspect_p.set_defaults(cb=do_inspect)
inspect_p.add_argument('path', help='relative path to a game file')
inspect_p.add_argument('mode', nargs='?', default='shorthex')
inspect_p.add_argument('fileno', nargs='?', default=all)
inspect_p.add_argument('entryno', nargs='?', default=all)
extract_p = sp.add_parser('extract', help='extract contents of a file')
extract_p.set_defaults(cb=do_extract)
extract_p.add_argument('path', help='relative path to a game file')
extract_p.add_argument('out', help='filename to use for extraction')
extract_p.add_argument('mode', nargs='?', default='raw')
extract_p.add_argument('fileno', nargs='?', default=all)
extract_p.add_argument('entryno', nargs='?', default=all)
return p
if __name__ == '__main__':
main(sys.argv[1:])

pokedex/extract/lib/lzss3.py Normal file

@@ -0,0 +1,287 @@
"""Support for the LZSS compression format.
Taken from magical's nlzss project: https://github.com/magical/nlzss
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import sys
from sys import stdin, stderr, exit
from os import SEEK_SET, SEEK_CUR, SEEK_END
from errno import EPIPE
from struct import pack, unpack
__all__ = ('decompress', 'decompress_file', 'decompress_bytes',
'decompress_overlay', 'DecompressionError')
class DecompressionError(ValueError):
pass
def bits(byte):
return ((byte >> 7) & 1,
(byte >> 6) & 1,
(byte >> 5) & 1,
(byte >> 4) & 1,
(byte >> 3) & 1,
(byte >> 2) & 1,
(byte >> 1) & 1,
(byte) & 1)
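# Editor's note: flags are consumed most-significant bit first, e.g.
# bits(0b10000001) == (1, 0, 0, 0, 0, 0, 0, 1).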
def decompress_raw_lzss10(indata, decompressed_size, _overlay=False):
"""Decompress LZSS-compressed bytes. Returns a bytearray."""
data = bytearray()
it = iter(indata)
if _overlay:
disp_extra = 3
else:
disp_extra = 1
def writebyte(b):
data.append(b)
def readbyte():
return next(it)
def readshort():
# big-endian
a = next(it)
b = next(it)
return (a << 8) | b
def copybyte():
data.append(next(it))
while len(data) < decompressed_size:
b = readbyte()
flags = bits(b)
for flag in flags:
if flag == 0:
copybyte()
elif flag == 1:
sh = readshort()
count = (sh >> 0xc) + 3
disp = (sh & 0xfff) + disp_extra
for _ in range(count):
writebyte(data[-disp])
else:
raise ValueError(flag)
if decompressed_size <= len(data):
break
if len(data) != decompressed_size:
raise DecompressionError(
"decompressed size does not match the expected size")
return data
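# Editor's note, a tiny hand-built example (not from the original commit): a
# flag byte of 0x01 marks seven literals followed by one reference; the
# big-endian short 0x0002 means count == (0 + 3) and disp == (2 + 1), i.e.
# copy three bytes starting three back:
#
#     >>> bytes(decompress_raw_lzss10(b'\x01abcdefg\x00\x02', 10))
#     b'abcdefgefg'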
def decompress_raw_lzss11(indata, decompressed_size):
"""Decompress LZSS-compressed bytes. Returns a bytearray."""
data = bytearray()
it = iter(indata)
def writebyte(b):
data.append(b)
def readbyte():
return next(it)
def copybyte():
data.append(next(it))
while len(data) < decompressed_size:
b = readbyte()
flags = bits(b)
for flag in flags:
if flag == 0:
copybyte()
elif flag == 1:
b = readbyte()
indicator = b >> 4
if indicator == 0:
# 8 bit count, 12 bit disp
# indicator is 0, don't need to mask b
count = (b << 4)
b = readbyte()
count += b >> 4
count += 0x11
elif indicator == 1:
# 16 bit count, 12 bit disp
count = ((b & 0xf) << 12) + (readbyte() << 4)
b = readbyte()
count += b >> 4
count += 0x111
else:
# indicator is count (4 bits), 12 bit disp
count = indicator
count += 1
disp = ((b & 0xf) << 8) + readbyte()
disp += 1
try:
for _ in range(count):
writebyte(data[-disp])
except IndexError:
raise Exception(count, disp, len(data), sum(1 for x in it))
else:
raise ValueError(flag)
if decompressed_size <= len(data):
break
if len(data) != decompressed_size:
raise DecompressionError(
"decompressed size does not match the expected size")
return data
def decompress_overlay(f, out):
# the compression header is at the end of the file
f.seek(-8, SEEK_END)
header = f.read(8)
# decompression goes backwards.
# end < here < start
# end_delta == here - decompression end address
# start_delta == decompression start address - here
end_delta, start_delta = unpack("<LL", header)
filelen = f.tell()
padding = end_delta >> 0x18
end_delta &= 0xFFFFFF
decompressed_size = start_delta + end_delta
f.seek(-end_delta, SEEK_END)
data = bytearray()
data.extend(f.read(end_delta - padding))
data.reverse()
uncompressed_data = decompress_raw_lzss10(
data, decompressed_size, _overlay=True)
uncompressed_data.reverse()
# first we write up to the portion of the file which was "overwritten" by
# the decompressed data, then the decompressed data itself.
# i wonder if it's possible for decompression to overtake the compressed
# data, so that the decompression code is reading its own output...
f.seek(0, SEEK_SET)
out.write(f.read(filelen - end_delta))
out.write(uncompressed_data)
def decompress(obj):
"""Decompress LZSS-compressed bytes or a file-like object.
Shells out to decompress_file() or decompress_bytes() depending on
whether or not the passed-in object has a 'read' attribute or not.
Returns a bytearray."""
if hasattr(obj, 'read'):
return decompress_file(obj)
else:
return decompress_bytes(obj)
def decompress_bytes(data):
"""Decompress LZSS-compressed bytes. Returns a bytearray."""
header = data[:4]
if header[0] == 0x10:
decompress_raw = decompress_raw_lzss10
elif header[0] == 0x11:
decompress_raw = decompress_raw_lzss11
else:
        raise DecompressionError("not an LZSS-compressed file")
decompressed_size, = unpack("<L", header[1:] + b'\x00')
data = data[4:]
return decompress_raw(data, decompressed_size)
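# Editor's note: the header is one type byte (0x10 or 0x11) followed by the
# decompressed size as a 24-bit little-endian integer.  Reusing the example
# from decompress_raw_lzss10 above:
#
#     >>> bytes(decompress_bytes(b'\x10\x0a\x00\x00\x01abcdefg\x00\x02'))
#     b'abcdefgefg'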
def decompress_file(f):
"""Decompress an LZSS-compressed file. Returns a bytearray.
This isn't any more efficient than decompress_bytes, as it reads
the entire file into memory. It is offered as a convenience.
"""
header = f.read(4)
if header[0] == 0x10:
decompress_raw = decompress_raw_lzss10
elif header[0] == 0x11:
decompress_raw = decompress_raw_lzss11
else:
        raise DecompressionError("not an LZSS-compressed file")
decompressed_size, = unpack("<L", header[1:] + b'\x00')
data = f.read()
return decompress_raw(data, decompressed_size)
def main(args=None):
if args is None:
args = sys.argv[1:]
if '--overlay' in args:
args.remove('--overlay')
overlay = True
else:
overlay = False
if len(args) < 1 or args[0] == '-':
if overlay:
print("Can't decompress overlays from stdin", file=stderr)
return 2
if hasattr(stdin, 'detach'):
f = stdin.detach()
else:
f = stdin
else:
try:
f = open(args[0], "rb")
except IOError as e:
print(e, file=stderr)
return 2
stdout = sys.stdout
if hasattr(stdout, 'detach'):
# grab the underlying binary stream
stdout = stdout.detach()
try:
if overlay:
decompress_overlay(f, stdout)
else:
stdout.write(decompress_file(f))
except IOError as e:
if e.errno == EPIPE:
# don't complain about a broken pipe
pass
else:
raise
except (DecompressionError,) as e:
print(e, file=stderr)
return 1
return 0
if __name__ == '__main__':
exit(main())

pokedex/extract/lib/pc.py Normal file

@@ -0,0 +1,19 @@
"""Allegedly stands for 'Pokémon Container'. Completely generic, dead-simple
container format.
"""
from .base import _ContainerFile, Substream
class PokemonContainerFile(_ContainerFile):
magic = b'PC'
def __init__(self, stream):
self.stream = stream = Substream(stream)
magic, entry_ct = stream.unpack('<2sH')
assert magic == b'PC'
self.slices = []
for _ in range(entry_ct):
start, end = stream.unpack('<LL')
self.slices.append(self.stream.slice(start, end - start))
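# Editor's note: a minimal hand-built PC file (illustration, not part of the
# original commit) -- the magic, a uint16 entry count, then one (start, end)
# offset pair per entry, measured from the beginning of the file:
#
#     >>> import io
#     >>> raw = b'PC\x01\x00\x0c\x00\x00\x00\x10\x00\x00\x00' + b'DATA'
#     >>> PokemonContainerFile(io.BytesIO(raw))[0].read()
#     b'DATA'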

pokedex/extract/lib/text.py Normal file

@@ -0,0 +1,115 @@
def merge_japanese_texts(kanji, kana, html=False):
"""Combine a (presuambly equivalent) pair of kanji and kana strings into a
single string of kanji with furigana.
If `html` is truthy, the return value will contain HTML ruby tags;
otherwise it will use the Unicode "interlinear annotation" characters.
    This relies on the Needleman-Wunsch algorithm for sequence alignment:
https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
"""
# TODO maybe this is faster, but then -1 doesn't work
#table = [
# [None for _ in range(len(kana))]
# for _ in range(len(kanji))
#]
table = {}
# continue left, continue up, are the characters equivalent, score for this
# cell
table[-1, -1] = False, False, True, 0
isjunk = {}
for ch in kanji + kana:
isjunk[ch] = ch.isspace() or ch in '。␤'
# initialize, TODO, something about scoring compared to a gap
for i, ch in enumerate(kanji):
table[i, -1] = True, False, False, -1 - i
for i, ch in enumerate(kana):
table[-1, i] = False, True, False, -1 - i
for a, ach in enumerate(kanji):
for b, bch in enumerate(kana):
options = []
# Continue diagonally means two characters together, either a match
# or a mismatch
if ach == bch or (isjunk[ach] and isjunk[bch]):
equiv = True
score = 1
else:
equiv = False
score = -1
            options.append((True, True, equiv, table[a - 1, b - 1][3] + score))
            # Continuing from either side means an indel... -1
            if isjunk[ach]:
                score = 0
            else:
                score = -1
            options.append((True, False, equiv, table[a - 1, b][3] + score))
            if isjunk[bch]:
                score = 0
            else:
                score = -1
            options.append((False, True, equiv, table[a, b - 1][3] + score))
            # Strictly speaking, in the case of a tie, all of the "best"
            # choices are supposed to be preserved.  But we should never have a
            # tie, and we have an arbitrary choice of which to use in the end
            # anyway, so screw it.
            # The score lives in element 3 of each cell, so compare on that
            table[a, b] = max(options, key=lambda opt: opt[3])
if html:
ruby_format = "<ruby><rb>{}</rb><rt>{}</rt></ruby>"
else:
ruby_format = "\ufff9{}\ufffa{}\ufffb"
def add_mismatches(mismatch_a, mismatch_b, final):
# Need to pop out any extra junk characters at the beginning or end --
# but only the kanji ones stay, since kanji is "canonical"
while mismatch_a and isjunk[mismatch_a[0]]:
final.append(mismatch_a.pop(0))
while mismatch_b and isjunk[mismatch_b[0]]:
mismatch_b.pop(0)
endjunk = []
while mismatch_a and isjunk[mismatch_a[-1]]:
endjunk.append(mismatch_a.pop())
while mismatch_b and isjunk[mismatch_b[-1]]:
mismatch_b.pop()
final.append(ruby_format.format(
''.join(reversed(mismatch_a)),
''.join(reversed(mismatch_b)),
))
final.extend(endjunk)
del mismatch_a[:]
del mismatch_b[:]
final = []
mismatch_a = []
mismatch_b = []
a = len(kanji) - 1
b = len(kana) - 1
while True:
walk_left, walk_up, equiv, score = table[a, b]
if walk_left and walk_up:
if equiv:
if mismatch_a or mismatch_b:
add_mismatches(mismatch_a, mismatch_b, final)
final.append(kanji[a])
else:
mismatch_a.append(kanji[a])
mismatch_b.append(kana[b])
a -= 1
b -= 1
elif walk_left:
mismatch_a.append(kanji[a])
a -= 1
elif walk_up:
mismatch_b.append(kana[b])
b -= 1
else:
break
if mismatch_a or mismatch_b:
add_mismatches(mismatch_a, mismatch_b, final)
return ''.join(reversed(final))
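# Editor's note: a minimal example (not part of the original commit),
# assuming the alignment behaves as described above -- a fully mismatched
# pair collapses into a single ruby group:
#
#     >>> merge_japanese_texts('図鑑', 'ずかん', html=True)
#     '<ruby><rb>図鑑</rb><rt>ずかん</rt></ruby>'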

pokedex/extract/oras.py Normal file

@@ -0,0 +1,844 @@
"""Dumps data from Omega Ruby and Alpha Sapphire.
Filesystem reference: http://www.projectpokemon.org/wiki/ORAS_File_System
"""
import argparse
from collections import OrderedDict
from contextlib import contextmanager
import itertools
import math
from pathlib import Path
import shutil
import struct
from construct import Array, BitField, Bitwise, Magic, OptionalGreedyRange, Padding, Pointer, Struct, SLInt8, SLInt16, ULInt8, ULInt16, ULInt32
import png
import yaml
from .lib.garc import GARCFile, decrypt_xy_text
from .lib.text import merge_japanese_texts
# TODO fix some hardcoding in here
# TODO finish converting garc parsing to use construct, if possible, i think (i would not miss substream)
# way way more sprite work in here...
CANON_LANGUAGES = ('ja', 'en', 'fr', 'it', 'de', 'es', 'ko')
ORAS_SCRIPT_FILES = {
'ja-kana': 'rom/a/0/7/1',
'ja-kanji': 'rom/a/0/7/2',
'en': 'rom/a/0/7/3',
'fr': 'rom/a/0/7/4',
'it': 'rom/a/0/7/5',
'de': 'rom/a/0/7/6',
'es': 'rom/a/0/7/7',
'ko': 'rom/a/0/7/8',
}
ORAS_SCRIPT_ENTRIES = {
'form-names': 5,
# TODO these might be backwards, i'm just guessing
'species-flavor-alpha-sapphire': 6,
'species-flavor-omega-ruby': 7,
'move-contest-flavor': 13,
'move-names': 14,
# Note: table 15 is also a list of move names, but with a few at the end
# missing? XY leftovers?
'move-flavor': 16,
'type-names': 18,
'ability-flavor': 36,
'ability-names': 37,
'nature-names': 51,
'species-names': 98,
}
# The first element in each list is the name of the BASE form -- if it's not
# None, the base form will be saved under two filenames
ORAS_EXTRA_SPRITE_NAMES = {
# Cosplay Pikachu
25: (None, 'rockstar', 'belle', 'popstar', 'phd', 'libre', 'cosplay'),
# Unown
201: tuple('abcdefghijklmnopqrstuvwxyz') + ('exclamation', 'question'),
# Castform
351: (None, 'sunny', 'rainy', 'snowy'),
# Kyogre and Groudon
382: (None, 'primal',),
383: (None, 'primal',),
# Deoxys
386: ('normal', 'attack', 'defense', 'speed'),
# Burmy and Wormadam
412: ('plant', 'sandy', 'trash'),
413: ('plant', 'sandy', 'trash'),
# Cherrim
421: ('overcast', 'sunshine',),
# Shellos and Gastrodon
422: ('west', 'east',),
423: ('west', 'east',),
# Rotom
479: (None, 'heat', 'wash', 'frost', 'fan', 'mow'),
# Giratina
487: ('altered', 'origin',),
# Shaymin
492: ('land', 'sky',),
# Arceus
493: (
'normal', 'fighting', 'flying', 'poison', 'ground', 'rock', 'bug',
'ghost', 'steel', 'fire', 'water', 'grass', 'electric', 'psychic',
'ice', 'dragon', 'dark', 'fairy',
),
# Basculin
550: ('red-striped', 'blue-striped',),
# Darmanitan
555: ('standard', 'zen',),
# Deerling and Sawsbuck
    585: ('spring', 'summer', 'autumn', 'winter'),
    586: ('spring', 'summer', 'autumn', 'winter'),
# Tornadus, Thundurus, and Landorus
641: ('incarnate', 'therian'),
642: ('incarnate', 'therian'),
645: ('incarnate', 'therian'),
# Kyurem
646: (None, 'white', 'black'),
# Keldeo
647: ('ordinary', 'resolute'),
# Meloetta
648: ('aria', 'pirouette'),
# Genesect
649: (None, 'douse', 'shock', 'burn', 'chill'),
# Vivillon
666: (
'icy-snow', 'polar', 'tundra', 'continental', 'garden', 'elegant',
'meadow', 'modern', 'marine', 'archipelago', 'high-plains',
'sandstorm', 'river', 'monsoon', 'savanna', 'sun', 'ocean', 'jungle',
'fancy', 'poke-ball',
),
# Flabébé/Floette/Florges
669: ('red', 'yellow', 'orange', 'blue', 'white'),
670: ('red', 'yellow', 'orange', 'blue', 'white', 'eternal'),
671: ('red', 'yellow', 'orange', 'blue', 'white'),
# Furfrou
676: (
'natural', 'heart', 'star', 'diamond', 'debutante', 'matron', 'dandy',
'la-reine', 'kabuki', 'pharaoh',
),
# Meowstic
#678: [male, female]
# Aegislash
681: ('shield', 'blade'),
# Pumpkaboo/Gourgeist
710: ('average', 'small', 'large', 'super'),
711: ('average', 'small', 'large', 'super'),
# Xerneas
716: ('neutral', 'active'),
# Hoopa
720: ('confined', 'unbound'),
}
pokemon_struct = Struct(
'pokemon',
ULInt8('stat_hp'),
ULInt8('stat_atk'),
ULInt8('stat_def'),
ULInt8('stat_speed'),
ULInt8('stat_spatk'),
ULInt8('stat_spdef'),
ULInt8('type1'),
ULInt8('type2'),
ULInt8('catch_rate'),
ULInt8('stage'),
ULInt16('effort'),
ULInt16('held_item1'),
ULInt16('held_item2'),
ULInt16('held_item3'), # dark grass from bw, unused in oras?
ULInt8('gender_rate'),
ULInt8('steps_to_hatch'),
ULInt8('base_happiness'),
ULInt8('exp_curve'),
ULInt8('egg_group1'),
ULInt8('egg_group2'),
ULInt8('ability1'),
ULInt8('ability2'),
ULInt8('ability_dream'),
ULInt8('safari_escape'),
ULInt16('form_species_start'),
ULInt16('form_sprite_start'),
ULInt8('form_count'),
ULInt8('color'),
ULInt16('base_exp'),
ULInt16('height'),
ULInt16('weight'),
Bitwise(
BitField('machines', 14 * 8, swapped=True),
),
Padding(2),
ULInt32('tutors'),
ULInt16('mystery1'),
ULInt16('mystery2'),
ULInt32('bp_tutors1'),
ULInt32('bp_tutors2'),
ULInt32('bp_tutors3'),
ULInt32('bp_tutors4'),
)
pokemon_mega_evolutions_struct = Array(
3,
Struct(
'pokemon_mega_evolutions',
ULInt16('number'),
ULInt16('mode'),
ULInt16('mega_stone_itemid'),
Padding(2),
)
)
egg_moves_struct = Struct(
'egg_moves',
ULInt16('count'),
Array(
lambda ctx: ctx.count,
ULInt16('moveids'),
),
)
level_up_moves_struct = OptionalGreedyRange(
Struct(
'level_up_pair',
SLInt16('moveid'),
SLInt16('level'),
),
)
move_struct = Struct(
'move',
ULInt8('type'),
ULInt8('category'),
ULInt8('damage_class'),
ULInt8('power'),
ULInt8('accuracy'),
ULInt8('pp'),
SLInt8('priority'),
ULInt8('min_max_hits'),
SLInt16('caused_effect'),
ULInt8('effect_chance'),
ULInt8('status'),
ULInt8('min_turns'),
ULInt8('max_turns'),
ULInt8('crit_rate'),
ULInt8('flinch_chance'),
ULInt16('effect'),
SLInt8('recoil'),
ULInt8('healing'),
ULInt8('range'), # ok
Bitwise(
BitField('stat_change', 24),
),
Bitwise(
BitField('stat_amount', 24),
),
Bitwise(
BitField('stat_chance', 24),
),
ULInt8('padding0'), # ok
ULInt8('padding1'), # ok
ULInt16('flags'),
ULInt8('padding2'), # ok
ULInt8('extra'),
)
move_container_struct = Struct(
'move_container',
Magic(b'WD'), # waza... descriptions?
ULInt16('record_ct'),
Array(
lambda ctx: ctx.record_ct,
Struct(
'records',
ULInt32('offset'),
Pointer(lambda ctx: ctx.offset, move_struct),
),
),
)
pokemon_sprite_struct = Struct(
'pokemon_sprite_config',
ULInt16('index'),
ULInt16('female_index'),
ULInt32('form_index_offset'),
ULInt32('right_index_offset'),
ULInt16('form_count'),
ULInt16('right_count'),
)
# There are 63 tutor move bits in use, but only 60 move tutors -- the moves
# appear to be largely inherited from B2W2 but these are just not exposed in
# ORAS
ORAS_UNUSED_MOVE_TUTORS = {'dark-pulse', 'roost', 'sleep-talk'}
# Unsure where this is in the binary
ORAS_NORMAL_MOVE_TUTORS = (
'grass-pledge',
'fire-pledge',
'water-pledge',
'frenzy-plant',
'blast-burn',
'hydro-cannon',
'draco-meteor',
'dragon-ascent',
)
@contextmanager
def read_garc(path):
with path.open('rb') as f:
yield GARCFile(f)
# XXX christ lol. taken from SO. fodder for camel maybe
def represent_ordereddict(dumper, data):
value = []
for item_key, item_value in data.items():
node_key = dumper.represent_data(item_key)
node_value = dumper.represent_data(item_value)
value.append((node_key, node_value))
return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', value)
yaml.add_representer(OrderedDict, represent_ordereddict)
def represent_tuple(dumper, data):
return yaml.nodes.SequenceNode(
u'tag:yaml.org,2002:seq',
[dumper.represent_data(item) for item in data],
flow_style=True,
)
yaml.add_representer(tuple, represent_tuple)
def dump_to_yaml(data, f):
# TODO gonna need a better way to handle flow style
yaml.dump(
data, f,
default_flow_style=False,
allow_unicode=True,
)
def extract_data(root, out):
# TODO big conceptual question for the yaml thing: how do we decide how the
# identifiers work in the per-version data? the "global" identifiers are
# in theory based on the names from the latest version, and the game dump
# scripts shouldn't have to care about what the latest version is
# 1. make the canon data not be keyed by identifier (makes it hard to
# follow what's going on in flavor text files etc, and unclear how to match
# up items across versions)
# 2. make each version's data keyed by its own identifiers (makes it hard
# to align them all when loading everything, and unclear how to match up
# items whose names change across versions)
# 3. hardcode a mapping of version+identifier pairs to their current
# identifiers, when they changed, which is a little ugly but also solves
# all the match-up problems and is what we'd basically have to do anyway
# -------------------------------------------------------------------------
# Names and flavor text
texts = {}
for lang, fn in ORAS_SCRIPT_FILES.items():
texts[lang] = {}
with read_garc(root / fn) as garc:
for entryname, entryid in ORAS_SCRIPT_ENTRIES.items():
entry = garc[entryid][0]
texts[lang][entryname] = decrypt_xy_text(entry.read())
# Japanese text is special! It's written in both kanji and kana, and we
# want to combine them
texts['ja'] = {}
for entryname in ORAS_SCRIPT_ENTRIES:
kanjis = texts['ja-kanji'][entryname]
kanas = texts['ja-kana'][entryname]
# But not if they're names of things.
# (TODO this might not be true in the case of, say, towns? in which
# case, what do we do? we want to ultimately put these in urls and
# whatnot, right, but we don't want furigana there :S do we need a
# separate "identifier" field /per language/?)
if entryname.endswith('names'):
assert kanjis == kanas
texts['ja'][entryname] = kanjis
else:
assert len(kanas) == len(kanjis)
texts['ja'][entryname] = [
merge_japanese_texts(kanji, kana)
for (kanji, kana) in zip(kanjis, kanas)
]
del texts['ja-kanji']
del texts['ja-kana']
identifiers = {}
identifiers['species'] = [
# TODO better identifier creation, to be determined later, but surely
# want to lose . and '
# TODO handling forms here is awkward since the form names are
# contained in the personal struct
((species_name or '') + '-' + form_name).lower().replace(' ', '-')
for (species_name, form_name) in itertools.zip_longest(
texts['en']['species-names'],
texts['en']['form-names'],
)
]
identifiers['move'] = [
# TODO better identifier creation, to be determined later, but surely
# want to lose . and '
name.lower().replace(' ', '-')
for name in texts['en']['move-names']
]
textdir = out / 'script'
if not textdir.exists():
textdir.mkdir()
for lang in CANON_LANGUAGES:
with (textdir / (lang + '.yaml')).open('w') as f:
# TODO this should use identifiers, not be lists
# TODO need to skip slot 0 which is junk
dump_to_yaml(texts[lang], f)
# -------------------------------------------------------------------------
# Scrape some useful bits from the binary
with (root / 'exe/code.bin').open('rb') as f:
# Tutored moves
tutor_moves = dict(tutors=ORAS_NORMAL_MOVE_TUTORS)
f.seek(0x004960f8)
for n in range(1, 5):
key = "bp_tutors{}".format(n)
moves = tutor_moves[key] = []
while True:
moveid, = struct.unpack('<H', f.read(2))
if moveid >= len(identifiers['move']):
break
moves.append(identifiers['move'][moveid])
# TMs
machines = []
f.seek(0x004a67ee)
machineids = struct.unpack('<107H', f.read(2 * 107))
# Order appears to be based on some gen 4 legacy: TMs 1 through 92, HMs
# 1 through 6, then the other eight TMs and the last HM. But the bits
# in the Pokémon structs are in the expected order of 1 through 100, 1
# through 7
machines = [
identifiers['move'][moveid]
for moveid in
machineids[0:92] +
machineids[98:106] +
machineids[92:98] +
machineids[106:]
]
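        # Editor's note: after this reshuffle, `machines` is ordered
        # TM01..TM100 followed by HM01..HM07, matching the bit order of the
        # `machines` bitfield in pokemon_struct.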
# -------------------------------------------------------------------------
# Pokémon structs
pokemon_data = []
with read_garc(root / 'rom/a/1/9/5') as garc:
personals = [subfile[0].read() for subfile in garc]
_pokemon_forms = {} # "real" species id => (base species id, form name id)
_next_name_form_id = 723
for i, personal in enumerate(personals[:-1]):
record = pokemon_struct.parse(personal)
# TODO transform to an OD somehow probably
pokemon_data.append(record)
#print("{:3d} {:15s} {} {:5d} {:5d}".format(
# i,
# identifiers['species'][baseid],
# ('0'*16 + bin(record.mystery1)[2:])[-16:],
# record.mystery2,
# record.stage,
#))
# TODO some pokemon have sprite starts but no species start, because their sprites vary obv
if record.form_count > 1:
# The form names appear to be all just jammed at the end in order,
# completely unrelated to either of the "start" offsets here
for offset in range(record.form_count - 1):
#form_name = texts['en']['form-names'][_next_name_form_id]
if record.form_species_start:
# TODO still no idea how "intangible" forms are being
# handled in the new schema
_pokemon_forms[record.form_species_start + offset] = i, _next_name_form_id
_next_name_form_id += 1
if record.form_species_start:
for offset in range(record.form_count - 1):
# TODO grab the form names argh
identifiers['species'][record.form_species_start + offset] = identifiers['species'][i]
#for i in range(723, 825 + 1):
# base_species_id, form_name_id = _pokemon_forms[i]
# species_name = texts['en']['species-names'][base_species_id]
# form_name = texts['en']['form-names'][form_name_id]
# print(i, species_name, '/', form_name)
# -------------------------------------------------------------------------
# Move stats
movesets = OrderedDict()
with read_garc(root / 'rom/a/1/8/9') as garc:
# Only one subfile
data = garc[0][0].read()
container = move_container_struct.parse(data)
for n, record in enumerate(container.records):
m = record.move
# TODO with the release of oras all moves have contest types and effects again! where are they??
#print("{:3d} {:20s} | {m.type:3d} {m.power:3d} {m.pp:2d} {m.accuracy:3d} / {m.priority:2d} {m.range:2d} {m.damage_class:1d} / {m.effect:3d} {m.caused_effect:3d} {m.effect_chance:3d} -- {m.status:3d} {m.min_turns:3d} {m.max_turns:3d} {m.crit_rate:3d} {m.flinch_chance:3d} {m.recoil:4d} {m.healing:3d} / {m.stat_change:06x} {m.stat_amount:06x} {m.stat_chance:06x} / {m.padding0:3d} {m.padding1:3d} {m.flags:04x} {m.padding2:3d} {m.extra:3d}".format(
# n,
# identifiers['move'][n],
# m=record.move,
#))
# Egg moves
with read_garc(root / 'rom/a/1/9/0') as garc:
for i, subfile in enumerate(garc):
ident = identifiers['species'][i]
data = subfile[0].read()
if not data:
continue
container = egg_moves_struct.parse(data)
moveset = movesets.setdefault(ident, OrderedDict())
eggset = moveset['egg'] = []
for moveid in container.moveids:
eggset.append(identifiers['move'][moveid])
# Level-up moves
with read_garc(root / 'rom/a/1/9/1') as garc:
for i, subfile in enumerate(garc):
ident = identifiers['species'][i]
level_up_moves = subfile[0].read()
moveset = movesets.setdefault(ident, OrderedDict())
levelset = moveset['level'] = []
lastlevel = None
order = 1
for pair in level_up_moves_struct.parse(level_up_moves):
# End is indicated with -1, -1
if pair.moveid <= 0:
break
levelset.append((
pair.level,
identifiers['move'][pair.moveid],
))
if pair.level == lastlevel:
order += 1
else:
lastlevel = pair.level
order = 1
# Evolution
#with read_garc(root / 'rom/a/1/9/2') as garc:
# for subfile in garc:
# evolution = subfile[0].read()
# print(repr(evolution))
# Mega evolution
#with read_garc(root / 'rom/a/1/9/3') as garc:
# for subfile in garc:
# evolution = subfile[0].read()
# print(repr(evolution))
# TODO what is a/1/9/4? 8 files of 404 bytes each
# Baby Pokémon
#with read_garc(root / 'rom/a/1/9/6') as garc:
# for subfile in garc:
# baby_pokemon = subfile[0].read()
# print(repr(baby_pokemon))
# Item stats
#with read_garc(root / 'rom/a/1/9/7') as garc:
# for subfile in garc:
# item_stats = subfile[0].read()
# print(repr(item_stats))
# Tutor moves (from the personal structs)
for i, datum in enumerate(pokemon_data):
ident = identifiers['species'][i]
moveset = movesets.setdefault(ident, OrderedDict())
tutorset = moveset['tutor'] = []
for key, tutors in tutor_moves.items():
for bit, moveident in enumerate(tutors):
if moveident in ORAS_UNUSED_MOVE_TUTORS:
continue
if not datum[key] & (1 << bit):
continue
tutorset.append(moveident)
# TMs
machineset = moveset['machine'] = []
for bit, moveident in enumerate(machines):
if not datum['machines'] & (1 << bit):
continue
machineset.append(moveident)
with (out / 'movesets.yaml').open('w') as f:
dump_to_yaml(movesets, f)
def extract_box_sprites(root, out):
filenames = {}
with (root / 'exe/code.bin').open('rb') as f:
# Form configuration, used to put sprites in the right order
# NOTE: in x/y the address is 0x0043ea98
f.seek(0x0047d650)
# TODO need to do a different thing for main sprites
# TODO magic number
for n in range(722):
sprite = pokemon_sprite_struct.parse_stream(f)
assert sprite.index not in filenames
filenames[sprite.index] = "{}".format(n)
if sprite.female_index != sprite.index:
assert sprite.female_index not in filenames
filenames[sprite.female_index] = "{}-female".format(n)
# Note that these addresses are relative to RAM, and the binary is
# loaded into RAM starting at 0x100000, so we need to subtract that
# to get a file position
pos = f.tell()
form_indices = ()
right_indices = ()
if sprite.form_index_offset:
f.seek(sprite.form_index_offset - 0x100000)
form_indices = struct.unpack(
"<{}H".format(sprite.form_count),
f.read(2 * sprite.form_count),
)
for form, form_idx in enumerate(form_indices):
# Ignore the first form, since it's the default and thus
# covered by `index` already
if form == 0:
continue
if form_idx == sprite.index:
continue
assert form_idx not in filenames
filenames[form_idx] = "{}-form{}".format(n, form)
if sprite.right_index_offset:
f.seek(sprite.right_index_offset - 0x100000)
right_indices = struct.unpack(
"<{}H".format(sprite.right_count),
f.read(2 * sprite.right_count),
)
if sprite.form_count:
assert sprite.right_count == sprite.form_count
for form, (form_idx, right_idx) in enumerate(zip(form_indices, right_indices)):
if form_idx == right_idx:
continue
if form != 0:
suffix = "form{}-right".format(form)
else:
suffix = 'right'
assert right_idx not in filenames
filenames[right_idx] = "{}-{}".format(n, suffix)
else:
assert sprite.right_count == 2
assert right_indices[0] == right_indices[1]
if right_indices[0] != sprite.index:
assert right_indices[0] not in filenames
filenames[right_indices[0]] = "{}-right".format(n)
f.seek(pos)
pokemon_sprites_dir = out
if not pokemon_sprites_dir.exists():
pokemon_sprites_dir.mkdir()
with read_garc(root / 'rom/a/0/9/1') as garc:
from .lib.clim import decode_clim
for i, subfile in enumerate(garc):
if i == 0:
# Dummy blank sprite, not interesting to us
continue
elif i in filenames:
filename = filenames[i] + '.png'
elif i == len(garc) - 1:
# Very last one is egg
filename = 'egg.png'
else:
# This is a duplicate Entei sprite that's not used
assert i in (333,)
continue
data = subfile[0].read()
width, height, color_depth, pixels = decode_clim(data)
png_writer = png.Writer(
width=width,
height=height,
alpha=True,
)
# this library is so fucking stupid
# TODO strictly speaking we could just write out a paletted PNG directly
# TODO add sBIT chunk indicating original bit depth
with (pokemon_sprites_dir / filename).open('wb') as f:
png_writer.write(f, (itertools.chain(*row) for row in pixels))
def extract_dex_sprites(root, out):
# Some Pokémon have dex sprites for their forms, too, and they're all
# clustered together, so we have to do a little work to fix the numbering.
# Luckily the dex sprites are in the same order as the models
# (unsurprising, as they're just model renders), which also tells us what
# Pokémon have female forms. The mega evolution map tells us which forms
# are megas, and the rest are listed manually above as
# ORAS_EXTRA_SPRITE_NAMES.
# Grab the list of megas first
num_megas = {} # pokemonid => number of mega evos
with read_garc(root / 'rom/a/1/9/3') as garc:
for pokemonid, subfile in enumerate(garc):
mega_evos = pokemon_mega_evolutions_struct.parse_stream(subfile[0])
num_megas[pokemonid] = max(
mega_evo.number for mega_evo in mega_evos)
# Then construct filenames, using num_megas plus information from the model
# index
filenames = {} # model/sprite number => filename, sans extension
duplicate_filenames = [] # pairs of (copy from, copy to)
with read_garc(root / 'rom/a/0/0/8') as garc:
f = garc[0][0]
# TODO magic number
for n in range(721):
# Unlike /virtually everywhere else/, Pokémon are zero-indexed here
pokemonid = n + 1
# Index of the first model (also zero-indexed), how many models the
# Pokémon has, and some flags
start, count, flags = struct.unpack('<HBB', f.read(4))
model_num = start + 1
# For some asinine reason, Xerneas is counted as two separate
# Pokémon in the dex sprites but not the models, so we have to
# shift everything after it back by 1
if pokemonid == 716:
count = 2
elif pokemonid >= 717:
model_num += 1
filenames[model_num] = str(pokemonid)
form_count = count - 1 # discount "base" form
total_model_count = model_num + count - 1
# Some "forms" have no real default, so we save the sprite both as
# nnn.png and nnn-form.png, to guarantee that nnn.png always exists
if pokemonid in ORAS_EXTRA_SPRITE_NAMES:
if ORAS_EXTRA_SPRITE_NAMES[pokemonid][0] is not None:
duplicate_filenames.append((
str(pokemonid),
"{}-{}".format(
pokemonid, ORAS_EXTRA_SPRITE_NAMES[pokemonid][0]),
))
# Don't know what flag 1 is; everything has it.
# Flag 2 means the first alternate form is a female variant.
if flags & 2:
assert form_count > 0
form_count -= 1
model_num += 1
filenames[model_num] = "female/{}".format(pokemonid)
# Flag 4 just means there are more forms?
if flags & 4:
assert form_count
assert 1 or 1 == sum((
form_count == 0,
num_megas[pokemonid] > 0,
pokemonid in ORAS_EXTRA_SPRITE_NAMES,
))
if num_megas[pokemonid]:
assert form_count == num_megas[pokemonid]
assert pokemonid not in ORAS_EXTRA_SPRITE_NAMES
model_num += 1
if form_count == 1:
filenames[model_num] = "{}-mega".format(pokemonid)
else:
# Charizard and Mewtwo
assert form_count == 2
filenames[model_num] = "{}-mega-x".format(pokemonid)
filenames[model_num + 1] = "{}-mega-y".format(pokemonid)
elif pokemonid in ORAS_EXTRA_SPRITE_NAMES:
for form_name in ORAS_EXTRA_SPRITE_NAMES[pokemonid][1:]:
model_num += 1
filenames[model_num] = "{}-{}".format(pokemonid, form_name)
# And now, do the ripping
# TODO This will save Unown A as 201.png, and not create a 201-a.png
pokemon_sprites_dir = out
with read_garc(root / 'rom/a/2/6/3') as garc:
from .lib.clim import decode_clim
for i, subfile in enumerate(garc):
shiny_prefix = ''
if i > total_model_count:
i -= total_model_count
shiny_prefix = 'shiny/'
if i == 0:
# Dummy blank sprite, not interesting to us
continue
elif 37 <= i <= 41:
# Cosplay Pikachu's outfits -- the sprites are blank, so saving
# these is not particularly useful
continue
elif i in filenames:
filename = shiny_prefix + filenames[i] + '.png'
else:
raise ValueError(
"Can't find a filename for sprite number {}".format(i))
data = subfile[0].read()
width, height, color_depth, pixels = decode_clim(data)
png_writer = png.Writer(
width=width,
height=height,
alpha=True,
)
# this library is so fucking stupid
# TODO strictly speaking we could just write out a paletted PNG directly
# TODO add sBIT chunk indicating original bit depth
path = pokemon_sprites_dir / filename
parent = path.parent
if not parent.exists():
parent.mkdir(parents=False)
with path.open('wb') as f:
png_writer.write(f, (itertools.chain(*row) for row in pixels))
for source, dest in duplicate_filenames:
shutil.copyfile(
str(pokemon_sprites_dir / source) + '.png',
str(pokemon_sprites_dir / dest) + '.png',
)
def _munge_source_arg(strpath):
path = Path(strpath)
if not path.is_dir():
raise argparse.ArgumentTypeError(
"{!r} is not a directory".format(strpath))
# TODO something something romfs, exefs
return path
def make_arg_parser():
p = argparse.ArgumentParser()
p.add_argument('what', choices=('data', 'dex-sprites', 'box-sprites'), help='what to extract')
# TODO should verify that this is an actual game dump, and find the rom/exe
p.add_argument('source', type=_munge_source_arg, help='path to an unpacked game image')
p.add_argument('dest', type=_munge_source_arg, help='directory to dump the results into')
return p
def main(args):
parser = make_arg_parser()
args = parser.parse_args(args)
# TODO support 'all', and just make some subdirectories per thing
# TODO or maybe merge all the sprite things together since stuff will need moving around anyway idk
if args.what == 'data':
extract_data(args.source, args.dest)
elif args.what == 'dex-sprites':
extract_dex_sprites(args.source, args.dest)
elif args.what == 'box-sprites':
extract_box_sprites(args.source, args.dest)
if __name__ == '__main__':
import sys
main(sys.argv[1:])