Mirror of https://github.com/veekun/pokedex.git
Initial gen6-to-yaml ripping stuff
parent 54ea67a804, commit 949eafb957
9 changed files with 1841 additions and 0 deletions
0  pokedex/extract/__init__.py  Normal file
0  pokedex/extract/lib/__init__.py  Normal file
87  pokedex/extract/lib/base.py  Normal file
@@ -0,0 +1,87 @@
"""Base or helper classes used a lot for dealing with file formats.
"""
import io
import struct


class Substream:
    """Wraps a stream and pretends it starts at an offset other than 0.

    Partly implements the file interface.

    This type always seeks before reading, but doesn't do so afterwards, so
    interleaving reads with the underlying stream may not do what you want.
    """
    def __init__(self, stream, offset=0, length=-1):
        if isinstance(stream, Substream):
            self.stream = stream.stream
            self.offset = offset + stream.offset
        else:
            self.stream = stream
            self.offset = offset

        self.length = length
        self.pos = 0

    def __repr__(self):
        return "<{} of {} at {}>".format(
            type(self).__name__, self.stream, self.offset)

    def read(self, n=-1):
        self.stream.seek(self.offset + self.pos)
        if n < 0:
            n = self.length
        elif self.length >= 0 and n > self.length:
            n = self.length
        data = self.stream.read(n)
        self.pos += len(data)
        return data

    def seek(self, offset):
        offset = max(offset, 0)
        if self.length >= 0:
            offset = min(offset, self.length)
        self.stream.seek(self.offset + offset)
        self.pos = self.tell()

    def tell(self):
        return self.stream.tell() - self.offset

    def __len__(self):
        if self.length < 0:
            pos = self.stream.tell()
            self.stream.seek(0, io.SEEK_END)
            parent_length = self.stream.tell()
            self.stream.seek(pos)
            return parent_length - self.offset
        else:
            return self.length

    def peek(self, n):
        pos = self.stream.tell()
        self.stream.seek(self.offset + self.pos)
        data = self.stream.read(n)
        self.stream.seek(pos)
        return data

    def unpack(self, fmt):
        """Unpacks a struct format from the current position in the stream."""
        data = self.read(struct.calcsize(fmt))
        return struct.unpack(fmt, data)

    def slice(self, offset, length=-1):
        # TODO limit or warn if length is too long for this slice?
        # n.b. __init__ already adds self.offset when given a Substream, so
        # pass the local offset here rather than counting it twice
        return Substream(self, offset, length)


class _ContainerFile:
    slices = ()

    def __len__(self):
        return len(self.slices)

    def __iter__(self):
        return iter(self.slices)

    def __getitem__(self, key):
        return self.slices[key]
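A minimal sketch of how Substream composes, using an in-memory stream (illustrative only):

import io

from pokedex.extract.lib.base import Substream

raw = io.BytesIO(b'junkHEADbody')
sub = Substream(raw, offset=4)   # pretend the stream starts at 'HEAD'
assert sub.read(4) == b'HEAD'
inner = sub.slice(4, 4)          # nested offsets add up: 4 + 4 into `raw`
assert inner.read() == b'body'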
182  pokedex/extract/lib/clim.py  Normal file
@@ -0,0 +1,182 @@
import math
import struct

import construct as c

clim_header_struct = c.Struct(
    'clim_header',
    c.Magic(b'CLIM'),
    c.Const(c.ULInt16('endianness'), 0xfeff),
    c.Const(c.ULInt16('header_length'), 0x14),
    c.ULInt32('version'),
    c.ULInt32('file_size'),
    c.ULInt32('blocks_ct'),
)
imag_header_struct = c.Struct(
    'imag_header',
    c.Magic(b'imag'),
    c.Const(c.ULInt32('section_length'), 0x10),
    c.ULInt16('width'),
    c.ULInt16('height'),
    c.Enum(
        c.ULInt32('format'),
        L8=0,
        A8=1,
        LA4=2,
        LA8=3,
        HILO8=4,
        RGB565=5,
        RGB8=6,
        RGBA5551=7,
        RGBA4=8,
        RGBA8=9,
        ETC1=10,
        ETC1A4=11,
        L4=12,
        A4=13,
        #ETC1=19,
    )
)


COLOR_DECODERS = {}


def _register_color_decoder(name, *, bpp, depth):
    def register(f):
        COLOR_DECODERS[name] = f, bpp, depth
        return f
    return register


@_register_color_decoder('RGBA4', bpp=2, depth=4)
def decode_rgba4(data):
    # The idea is that every uint16 is a packed rrrrggggbbbbaaaa, but when
    # written out little-endian this becomes bbbbaaaarrrrgggg and there's just
    # no pretty way to deal with this
    for i in range(0, len(data), 2):
        ba = data[i]
        rg = data[i + 1]
        r = (((rg & 0xf0) >> 4) * 255 + 7) // 15
        g = (((rg & 0x0f) >> 0) * 255 + 7) // 15
        b = (((ba & 0xf0) >> 4) * 255 + 7) // 15
        a = (((ba & 0x0f) >> 0) * 255 + 7) // 15
        yield r, g, b, a


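A quick check of the nibble order described in the comment above: an opaque red pixel packs as rrrrgggg = 0xf0 and bbbbaaaa = 0x0f, stored little-endian as the byte pair (0x0f, 0xf0):

assert list(decode_rgba4(bytes([0x0f, 0xf0]))) == [(255, 0, 0, 255)]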
@_register_color_decoder('RGBA5551', bpp=2, depth=5)
def decode_rgba5551(data, *, start=0, count=None):
    # I am extremely irritated that construct cannot parse this mess for me
    # rrrrrgggggbbbbba
    if count is None:
        end = len(data)
    else:
        end = start + count * 2

    for i in range(start, end, 2):
        datum = data[i] + data[i + 1] * 256
        r = (((datum >> 11) & 0x1f) * 255 + 15) // 31
        g = (((datum >> 6) & 0x1f) * 255 + 15) // 31
        b = (((datum >> 1) & 0x1f) * 255 + 15) // 31
        a = (datum & 0x1) * 255
        yield r, g, b, a


del _register_color_decoder


def apply_palette(palette, data, *, start=0):
    # TODO i am annoyed that this does a pointless copy, but i assume islice()
    # has even more overhead...
    if start != 0:
        data = data[start:]

    if len(palette) <= 16:
        # Short palettes allow cramming two pixels into each byte
        return (
            palette[idx]
            for byte in data
            for idx in (byte >> 4, byte & 0x0f)
        )
    else:
        return map(palette.__getitem__, data)


def untile_pixels(raw_pixels, width, height):
    """Unscramble pixels into plain old rows.

    The pixels are arranged in 8×8 tiles, and each tile is a third-
    iteration Z-order curve.

    Taken from: https://github.com/Zhorken/pokemon-x-y-icons/
    """

    # Images are stored padded to powers of two
    stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
    stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
    num_pixels = stored_width * stored_height
    tile_width = stored_width // 8

    pixels = [
        [None for x in range(width)]
        for y in range(height)
    ]

    for n, pixel in enumerate(raw_pixels):
        if n >= num_pixels:
            break

        # Find the coordinates of the top-left corner of the current tile.
        # n.b. The image is eight tiles wide, and each tile is 8×8 pixels.
        tile_num = n // 64
        tile_y = tile_num // tile_width * 8
        tile_x = tile_num % tile_width * 8

        # Determine the pixel's coordinates within the tile
        # http://en.wikipedia.org/wiki/Z-order_curve#Coordinate_values
        within_tile = n % 64

        sub_x = (
            (within_tile & 0b000001) |
            (within_tile & 0b000100) >> 1 |
            (within_tile & 0b010000) >> 2
        )
        sub_y = (
            (within_tile & 0b000010) >> 1 |
            (within_tile & 0b001000) >> 2 |
            (within_tile & 0b100000) >> 3
        )

        # Add up the pixel's coordinates within the whole image
        x = tile_x + sub_x
        y = tile_y + sub_y

        if x < width and y < height:
            pixels[y][x] = pixel

    return pixels


def decode_clim(data):
    imag_header = imag_header_struct.parse(data[-20:])
    if imag_header.format not in COLOR_DECODERS:
        raise ValueError(
            "don't know how to decode {} pixels".format(imag_header.format))
    color_decoder, color_bpp, color_depth = COLOR_DECODERS[imag_header.format]

    mode, = struct.unpack_from('<H', data, 0)
    if mode == 2:
        # Paletted
        palette_length, = struct.unpack_from('<H', data, 2)
        palette = list(color_decoder(data, start=4, count=palette_length))
        data_start = 4 + palette_length * color_bpp
        scrambled_pixels = apply_palette(palette, data[data_start:])
    else:
        scrambled_pixels = color_decoder(data)

    pixels = untile_pixels(
        scrambled_pixels,
        imag_header.width,
        imag_header.height,
    )
    return imag_header.width, imag_header.height, color_depth, pixels
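The Z-order bit-shuffle in untile_pixels is easiest to see on concrete indices; pulling the interleaving out on its own, the first few positions trace the classic Z pattern:

def zorder_xy(n):
    # same bit-interleaving as untile_pixels, isolated for illustration
    sub_x = (n & 0b000001) | (n & 0b000100) >> 1 | (n & 0b010000) >> 2
    sub_y = (n & 0b000010) >> 1 | (n & 0b001000) >> 2 | (n & 0b100000) >> 3
    return sub_x, sub_y

assert [zorder_xy(n) for n in range(4)] == [(0, 0), (1, 0), (0, 1), (1, 1)]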
307  pokedex/extract/lib/garc.py  Normal file
@@ -0,0 +1,307 @@
"""Support for reading the GARC generic container format used in the 3DS
filesystem.

Based on code by Zhorken: https://github.com/Zhorken/pokemon-x-y-icons
and Kaphotics: https://github.com/kwsch/GARCTool
"""
from io import BytesIO
from pathlib import Path
import struct
import sys

import construct as c

from . import lzss3
from .base import _ContainerFile, Substream
from .pc import PokemonContainerFile


def count_bits(n):
    c = 0
    while n:
        c += n & 1
        n >>= 1
    return c


garc_header_struct = c.Struct(
    'garc_header',
    c.Magic(b'CRAG'),
    c.Const(c.ULInt32('header_size'), 0x1c),
    c.Const(c.ULInt16('byte_order'), 0xfeff),
    c.Const(c.ULInt16('mystery1'), 0x0400),
    c.Const(c.ULInt32('chunks_ct'), 4),
    c.ULInt32('data_offset'),
    c.ULInt32('garc_length'),
    c.ULInt32('last_length'),
)
fato_header_struct = c.Struct(
    'fato_header',
    c.Magic(b'OTAF'),
    c.ULInt32('header_size'),
    c.ULInt16('count'),
    c.Const(c.ULInt16('padding'), 0xffff),
    c.Array(
        lambda ctx: ctx.count,
        c.ULInt32('fatb_offsets'),
    ),
)
fatb_header_struct = c.Struct(
    'fatb_header',
    c.Magic(b'BTAF'),
    c.ULInt32('fatb_length'),
    c.ULInt32('count'),
)


class GARCFile(_ContainerFile):
    def __init__(self, stream):
        self.stream = stream = Substream(stream)

        garc_header = garc_header_struct.parse_stream(self.stream)
        # FATO (file allocation table... offsets?)
        fato_header = fato_header_struct.parse_stream(self.stream)
        # FATB (file allocation table)
        fatb_header = fatb_header_struct.parse_stream(self.stream)

        fatb_start = garc_header.header_size + fato_header.header_size
        assert stream.tell() == fatb_start + 12

        self.slices = []
        for i, offset in enumerate(fato_header.fatb_offsets):
            stream.seek(fatb_start + offset + 12)

            slices = []
            bits, = struct.unpack('<L', stream.read(4))
            while bits:
                if bits & 1:
                    start, end, length = struct.unpack('<3L', stream.read(12))
                    assert end - 4 < start + length <= end
                    slices.append((garc_header.data_offset + start, length))
                bits >>= 1

            self.slices.append(GARCEntry(stream, slices))

        # FIMB
        stream.seek(fatb_start + fatb_header.fatb_length)
        magic, fimb_header_length, fimb_length = struct.unpack(
            '<4s2L', stream.read(12))
        assert magic == b'BMIF'
        assert fimb_header_length == 0xC


class GARCEntry(object):
    def __init__(self, stream, slices):
        self.stream = stream
        self.slices = slices

    def __getitem__(self, i):
        start, length = self.slices[i]
        ss = self.stream.slice(start, length)
        if ss.peek(1) in [b'\x10', b'\x11']:
            # XXX this sucks but there's no real way to know for sure whether
            # data is compressed or not. maybe just bake this into the caller
            # and let them deal with it, same way we do with text decoding?
            # TODO it would be nice if this could be done lazily for 'inspect'
            # purposes, since the first four bytes are enough to tell you the
            # size
            try:
                data = lzss3.decompress_bytes(ss.read())
            except Exception:
                ss.seek(0)
            else:
                return Substream(BytesIO(data))
        return ss

    def __len__(self):
        return len(self.slices)


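Putting GARCFile and GARCEntry together, reading one subfile looks something like this sketch (the path is hypothetical; any GARC from an extracted ROM dump should behave the same):

with open('romfs/a/0/7/3', 'rb') as f:  # hypothetical dump path
    garc = GARCFile(f)
    entry = garc[0]        # GARCEntry: one record, possibly several subfiles
    subfile = entry[0]     # Substream, transparently LZSS-decompressed
    data = subfile.read()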
XY_CHAR_MAP = {
    0x307f: 0x202f,  # nbsp
    0xe08d: 0x2026,  # ellipsis
    0xe08e: 0x2642,  # male sign
    0xe08f: 0x2640,  # female sign
}

XY_VAR_NAMES = {
    0xff00: "COLOR",
    0x0100: "TRNAME",
    0x0101: "PKNAME",
    0x0102: "PKNICK",
    0x0103: "TYPE",
    0x0105: "LOCATION",
    0x0106: "ABILITY",
    0x0107: "MOVE",
    0x0108: "ITEM1",
    0x0109: "ITEM2",
    0x010a: "sTRBAG",
    0x010b: "BOX",
    0x010d: "EVSTAT",
    0x0110: "OPOWER",
    0x0127: "RIBBON",
    0x0134: "MIINAME",
    0x013e: "WEATHER",
    0x0189: "TRNICK",
    0x018a: "1stchrTR",
    0x018b: "SHOUTOUT",
    0x018e: "BERRY",
    0x018f: "REMFEEL",
    0x0190: "REMQUAL",
    0x0191: "WEBSITE",
    0x019c: "CHOICECOS",
    0x01a1: "GSYNCID",
    0x0192: "PRVIDSAY",
    0x0193: "BTLTEST",
    0x0195: "GENLOC",
    0x0199: "CHOICEFOOD",
    0x019a: "HOTELITEM",
    0x019b: "TAXISTOP",
    0x019f: "MAISTITLE",
    0x1000: "ITEMPLUR0",
    0x1001: "ITEMPLUR1",
    0x1100: "GENDBR",
    0x1101: "NUMBRNCH",
    0x1302: "iCOLOR2",
    0x1303: "iCOLOR3",
    0x0200: "NUM1",
    0x0201: "NUM2",
    0x0202: "NUM3",
    0x0203: "NUM4",
    0x0204: "NUM5",
    0x0205: "NUM6",
    0x0206: "NUM7",
    0x0207: "NUM8",
    0x0208: "NUM9",
}


def _xy_inner_keygen(key):
    while True:
        yield key
        key = ((key << 3) | (key >> 13)) & 0xffff


def _xy_outer_keygen():
    key = 0x7c89
    while True:
        yield _xy_inner_keygen(key)
        key = (key + 0x2983) & 0xffff

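The text cipher below is a plain XOR against this rolling keystream, so applying the same keygen twice round-trips; for illustration:

plain = [0x0048, 0x0069]  # two UTF-16 code units, 'H' and 'i'
encrypted = [ch ^ k for ch, k in zip(plain, _xy_inner_keygen(0x7c89))]
decrypted = [ch ^ k for ch, k in zip(encrypted, _xy_inner_keygen(0x7c89))]
assert decrypted == plain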

def decrypt_xy_text(data):
    text_sections, lines, length, initial_key, section_data = struct.unpack_from(
        '<HHLLl', data)

    outer_keygen = _xy_outer_keygen()
    ret = []

    for i in range(lines):
        keygen = next(outer_keygen)
        s = []
        offset, length = struct.unpack_from('<lh', data, i * 8 + section_data + 4)
        offset += section_data
        start = offset
        characters = []
        for ech in struct.unpack_from("<{}H".format(length), data, offset):
            characters.append(ech ^ next(keygen))

        chiter = iter(characters)
        for c in chiter:
            if c == 0:
                break
            elif c == 0x10:
                # Goofy variable thing
                length = next(chiter)
                typ = next(chiter)
                if typ == 0xbe00:
                    # Pause, then scroll
                    s.append('\r')
                elif typ == 0xbe01:
                    # Pause, then clear screen
                    s.append('\f')
                elif typ == 0xbe02:
                    # Pause for some amount of time?
                    s.append("{{pause:{}}}".format(next(chiter)))
                elif typ == 0xbdff:
                    # Empty text line? Includes line number, maybe for finding unused lines?
                    s.append("{{blank:{}}}".format(next(chiter)))
                else:
                    s.append("{{{}:{}}}".format(
                        XY_VAR_NAMES.get(typ, "{:04x}".format(typ)),
                        ','.join(str(next(chiter)) for _ in range(length - 1)),
                    ))
            else:
                s.append(chr(XY_CHAR_MAP.get(c, c)))

        ret.append(''.join(s))

    return ret


def main(args):
    parser = make_arg_parser()
    args = parser.parse_args(args)
    args.cb(args)


def do_inspect(args):
    with open(args.path, 'rb') as f:
        garc = GARCFile(f)
        for i, topfile in enumerate(garc):
            print("File #{}, {} entr{}".format(
                i, len(topfile), 'y' if len(topfile) == 1 else 'ies'))
            for j, subfile in enumerate(topfile):
                print(' ', j, len(subfile), end='')
                if subfile.peek(2) == b'PC':
                    print(" -- appears to be a PC file (generic container)")
                    pcfile = PokemonContainerFile(subfile)
                    for k, entry in enumerate(pcfile):
                        print(' ', repr(entry.read(50)))
                else:
                    print('', repr(subfile.read(50)))


def do_extract(args):
    with open(args.path, 'rb') as f:
        garc = GARCFile(f)
        # TODO shouldn't path really be a directory, so you can mass-extract everything? do i want to do that ever?
        # TODO actually respect mode, fileno, entryno
        for i, topfile in enumerate(garc):
            # TODO i guess this should be a list, or??
            if args.fileno is not all and args.fileno != i:
                continue
            for j, subfile in enumerate(topfile):
                # TODO auto-detect extension, maybe? depending on mode?
                outfile = Path("{}-{}-{}".format(args.out, i, j))
                with outfile.open('wb') as g:
                    # TODO should use copyfileobj
                    g.write(subfile.read())
                print("wrote", outfile)


def make_arg_parser():
    from argparse import ArgumentParser
    p = ArgumentParser()
    sp = p.add_subparsers(metavar='command')

    inspect_p = sp.add_parser('inspect', help='examine a particular file')
    inspect_p.set_defaults(cb=do_inspect)
    inspect_p.add_argument('path', help='relative path to a game file')
    inspect_p.add_argument('mode', nargs='?', default='shorthex')
    inspect_p.add_argument('fileno', nargs='?', default=all)
    inspect_p.add_argument('entryno', nargs='?', default=all)

    extract_p = sp.add_parser('extract', help='extract contents of a file')
    extract_p.set_defaults(cb=do_extract)
    extract_p.add_argument('path', help='relative path to a game file')
    extract_p.add_argument('out', help='filename to use for extraction')
    extract_p.add_argument('mode', nargs='?', default='raw')
    extract_p.add_argument('fileno', nargs='?', default=all)
    extract_p.add_argument('entryno', nargs='?', default=all)

    return p


if __name__ == '__main__':
    main(sys.argv[1:])
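Since the module has a __main__ hook, the two subcommands can be driven directly; invocation would look something like this (paths are hypothetical):

$ python -m pokedex.extract.lib.garc inspect romfs/a/0/7/3
$ python -m pokedex.extract.lib.garc extract romfs/a/0/7/3 out/text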
287  pokedex/extract/lib/lzss3.py  Normal file
@@ -0,0 +1,287 @@
"""Support for the LZSS compression format.

Taken from magical's nlzss project: https://github.com/magical/nlzss
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import sys
from sys import stdin, stderr, exit
from os import SEEK_SET, SEEK_CUR, SEEK_END
from errno import EPIPE
from struct import pack, unpack


__all__ = ('decompress', 'decompress_file', 'decompress_bytes',
           'decompress_overlay', 'DecompressionError')


class DecompressionError(ValueError):
    pass


def bits(byte):
    return ((byte >> 7) & 1,
            (byte >> 6) & 1,
            (byte >> 5) & 1,
            (byte >> 4) & 1,
            (byte >> 3) & 1,
            (byte >> 2) & 1,
            (byte >> 1) & 1,
            (byte) & 1)


def decompress_raw_lzss10(indata, decompressed_size, _overlay=False):
    """Decompress LZSS-compressed bytes. Returns a bytearray."""
    data = bytearray()

    it = iter(indata)

    if _overlay:
        disp_extra = 3
    else:
        disp_extra = 1

    def writebyte(b):
        data.append(b)

    def readbyte():
        return next(it)

    def readshort():
        # big-endian
        a = next(it)
        b = next(it)
        return (a << 8) | b

    def copybyte():
        data.append(next(it))

    while len(data) < decompressed_size:
        b = readbyte()
        flags = bits(b)
        for flag in flags:
            if flag == 0:
                copybyte()
            elif flag == 1:
                sh = readshort()
                count = (sh >> 0xc) + 3
                disp = (sh & 0xfff) + disp_extra

                for _ in range(count):
                    writebyte(data[-disp])
            else:
                raise ValueError(flag)

            if decompressed_size <= len(data):
                break

    if len(data) != decompressed_size:
        raise DecompressionError(
            "decompressed size does not match the expected size")

    return data


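A worked example of the LZSS10 stream format: one flag byte (0b00100000) marks the third item as a back-reference; the reference short 0x3001 unpacks as count = (0x3 + 3) = 6 and disp = (0x001 + 1) = 2, so two literals expand into eight bytes:

raw = bytes([0b00100000]) + b'AB' + bytes([0x30, 0x01])
assert decompress_raw_lzss10(raw, 8) == bytearray(b'ABABABAB')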
def decompress_raw_lzss11(indata, decompressed_size):
    """Decompress LZSS-compressed bytes. Returns a bytearray."""
    data = bytearray()

    it = iter(indata)

    def writebyte(b):
        data.append(b)

    def readbyte():
        return next(it)

    def copybyte():
        data.append(next(it))

    while len(data) < decompressed_size:
        b = readbyte()
        flags = bits(b)
        for flag in flags:
            if flag == 0:
                copybyte()
            elif flag == 1:
                b = readbyte()
                indicator = b >> 4

                if indicator == 0:
                    # 8 bit count, 12 bit disp
                    # indicator is 0, don't need to mask b
                    count = (b << 4)
                    b = readbyte()
                    count += b >> 4
                    count += 0x11
                elif indicator == 1:
                    # 16 bit count, 12 bit disp
                    count = ((b & 0xf) << 12) + (readbyte() << 4)
                    b = readbyte()
                    count += b >> 4
                    count += 0x111
                else:
                    # indicator is count (4 bits), 12 bit disp
                    count = indicator
                    count += 1

                disp = ((b & 0xf) << 8) + readbyte()
                disp += 1

                try:
                    for _ in range(count):
                        writebyte(data[-disp])
                except IndexError:
                    raise Exception(count, disp, len(data), sum(1 for x in it))
            else:
                raise ValueError(flag)

            if decompressed_size <= len(data):
                break

    if len(data) != decompressed_size:
        raise DecompressionError(
            "decompressed size does not match the expected size")

    return data


def decompress_overlay(f, out):
    # the compression header is at the end of the file
    f.seek(-8, SEEK_END)
    header = f.read(8)

    # decompression goes backwards.
    # end < here < start

    # end_delta == here - decompression end address
    # start_delta == decompression start address - here
    end_delta, start_delta = unpack("<LL", header)

    filelen = f.tell()

    padding = end_delta >> 0x18
    end_delta &= 0xFFFFFF
    decompressed_size = start_delta + end_delta

    f.seek(-end_delta, SEEK_END)

    data = bytearray()
    data.extend(f.read(end_delta - padding))
    data.reverse()

    uncompressed_data = decompress_raw_lzss10(
        data, decompressed_size, _overlay=True)
    uncompressed_data.reverse()

    # first we write up to the portion of the file which was "overwritten" by
    # the decompressed data, then the decompressed data itself.
    # i wonder if it's possible for decompression to overtake the compressed
    # data, so that the decompression code is reading its own output...
    f.seek(0, SEEK_SET)
    out.write(f.read(filelen - end_delta))
    out.write(uncompressed_data)


def decompress(obj):
    """Decompress LZSS-compressed bytes or a file-like object.

    Shells out to decompress_file() or decompress_bytes() depending on
    whether or not the passed-in object has a 'read' attribute or not.

    Returns a bytearray."""
    if hasattr(obj, 'read'):
        return decompress_file(obj)
    else:
        return decompress_bytes(obj)


def decompress_bytes(data):
    """Decompress LZSS-compressed bytes. Returns a bytearray."""
    header = data[:4]
    if header[0] == 0x10:
        decompress_raw = decompress_raw_lzss10
    elif header[0] == 0x11:
        decompress_raw = decompress_raw_lzss11
    else:
        raise DecompressionError("not an lzss-compressed file")

    decompressed_size, = unpack("<L", header[1:] + b'\x00')

    data = data[4:]
    return decompress_raw(data, decompressed_size)


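The four-byte header is just the compression type (0x10 or 0x11) plus a 24-bit little-endian decompressed size, so a whole compressed blob for the example above looks like:

blob = b'\x10' + b'\x08\x00\x00' + bytes([0b00100000]) + b'AB' + bytes([0x30, 0x01])
assert decompress_bytes(blob) == bytearray(b'ABABABAB')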
def decompress_file(f):
    """Decompress an LZSS-compressed file. Returns a bytearray.

    This isn't any more efficient than decompress_bytes, as it reads
    the entire file into memory. It is offered as a convenience.
    """
    header = f.read(4)
    if header[0] == 0x10:
        decompress_raw = decompress_raw_lzss10
    elif header[0] == 0x11:
        decompress_raw = decompress_raw_lzss11
    else:
        raise DecompressionError("not an lzss-compressed file")

    decompressed_size, = unpack("<L", header[1:] + b'\x00')

    data = f.read()
    return decompress_raw(data, decompressed_size)


def main(args=None):
    if args is None:
        args = sys.argv[1:]

    if '--overlay' in args:
        args.remove('--overlay')
        overlay = True
    else:
        overlay = False

    if len(args) < 1 or args[0] == '-':
        if overlay:
            print("Can't decompress overlays from stdin", file=stderr)
            return 2

        if hasattr(stdin, 'detach'):
            f = stdin.detach()
        else:
            f = stdin
    else:
        try:
            f = open(args[0], "rb")
        except IOError as e:
            print(e, file=stderr)
            return 2

    stdout = sys.stdout
    if hasattr(stdout, 'detach'):
        # grab the underlying binary stream
        stdout = stdout.detach()

    try:
        if overlay:
            decompress_overlay(f, stdout)
        else:
            stdout.write(decompress_file(f))
    except IOError as e:
        if e.errno == EPIPE:
            # don't complain about a broken pipe
            pass
        else:
            raise
    except (DecompressionError,) as e:
        print(e, file=stderr)
        return 1

    return 0


if __name__ == '__main__':
    exit(main())
19  pokedex/extract/lib/pc.py  Normal file
@@ -0,0 +1,19 @@
"""Allegedly stands for 'Pokémon Container'. Completely generic, dead-simple
container format.
"""
from .base import _ContainerFile, Substream


class PokemonContainerFile(_ContainerFile):
    magic = b'PC'

    def __init__(self, stream):
        self.stream = stream = Substream(stream)

        magic, entry_ct = stream.unpack('<2sH')
        assert magic == b'PC'

        self.slices = []
        for _ in range(entry_ct):
            start, end = stream.unpack('<LL')
            self.slices.append(self.stream.slice(start, end - start))
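The format really is that simple; a hand-built two-entry container round-trips (illustrative):

import io
import struct

# header: b'PC', entry count, then (start, end) offset pairs pointing past
# the 20-byte header
blob = (b'PC' + struct.pack('<H', 2)
        + struct.pack('<LL', 20, 23)
        + struct.pack('<LL', 23, 28)
        + b'abc' + b'hello')
pc = PokemonContainerFile(io.BytesIO(blob))
assert pc[0].read() == b'abc'
assert pc[1].read() == b'hello'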
115  pokedex/extract/lib/text.py  Normal file
@@ -0,0 +1,115 @@
def merge_japanese_texts(kanji, kana, html=False):
    """Combine a (presumably equivalent) pair of kanji and kana strings into a
    single string of kanji with furigana.

    If `html` is truthy, the return value will contain HTML ruby tags;
    otherwise it will use the Unicode "interlinear annotation" characters.

    This relies on the Needleman–Wunsch algorithm for sequence alignment:
    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """
    # TODO maybe this is faster, but then -1 doesn't work
    #table = [
    #    [None for _ in range(len(kana))]
    #    for _ in range(len(kanji))
    #]
    table = {}
    # continue left, continue up, are the characters equivalent, score for this
    # cell
    table[-1, -1] = False, False, True, 0

    isjunk = {}
    for ch in kanji + kana:
        isjunk[ch] = ch.isspace() or ch in '。'

    # initialize, TODO, something about scoring compared to a gap
    for i, ch in enumerate(kanji):
        table[i, -1] = True, False, False, -1 - i
    for i, ch in enumerate(kana):
        table[-1, i] = False, True, False, -1 - i
    for a, ach in enumerate(kanji):
        for b, bch in enumerate(kana):
            options = []
            # Continue diagonally means two characters together, either a match
            # or a mismatch
            if ach == bch or (isjunk[ach] and isjunk[bch]):
                equiv = True
                score = 1
            else:
                equiv = False
                score = -1
            options.append((True, True, equiv, table[a - 1, b - 1][3] + score))

            # Continue from either side means an indel... -1
            if isjunk[ach]:
                score = 0
            else:
                score = -1
            options.append((True, False, equiv, table[a - 1, b][3] + score))
            if isjunk[bch]:
                score = 0
            else:
                score = -1
            options.append((False, True, equiv, table[a, b - 1][3] + score))

            # Strictly speaking, in the case of a tie, all of the "best"
            # choices are supposed to be preserved. But we should never have a
            # tie, and we have an arbitrary choice of which to use in the end
            # anyway, so screw it.
            table[a, b] = max(options, key=lambda opt: opt[3])

    if html:
        ruby_format = "<ruby><rb>{}</rb><rt>{}</rt></ruby>"
    else:
        ruby_format = "\ufff9{}\ufffa{}\ufffb"

    def add_mismatches(mismatch_a, mismatch_b, final):
        # Need to pop out any extra junk characters at the beginning or end --
        # but only the kanji ones stay, since kanji is "canonical"
        while mismatch_a and isjunk[mismatch_a[0]]:
            final.append(mismatch_a.pop(0))
        while mismatch_b and isjunk[mismatch_b[0]]:
            mismatch_b.pop(0)
        endjunk = []
        while mismatch_a and isjunk[mismatch_a[-1]]:
            endjunk.append(mismatch_a.pop())
        while mismatch_b and isjunk[mismatch_b[-1]]:
            mismatch_b.pop()
        final.append(ruby_format.format(
            ''.join(reversed(mismatch_a)),
            ''.join(reversed(mismatch_b)),
        ))
        final.extend(endjunk)
        del mismatch_a[:]
        del mismatch_b[:]

    final = []
    mismatch_a = []
    mismatch_b = []
    a = len(kanji) - 1
    b = len(kana) - 1
    while True:
        walk_left, walk_up, equiv, score = table[a, b]
        if walk_left and walk_up:
            if equiv:
                if mismatch_a or mismatch_b:
                    add_mismatches(mismatch_a, mismatch_b, final)
                final.append(kanji[a])
            else:
                mismatch_a.append(kanji[a])
                mismatch_b.append(kana[b])
            a -= 1
            b -= 1
        elif walk_left:
            mismatch_a.append(kanji[a])
            a -= 1
        elif walk_up:
            mismatch_b.append(kana[b])
            b -= 1
        else:
            break

    if mismatch_a or mismatch_b:
        add_mismatches(mismatch_a, mismatch_b, final)

    return ''.join(reversed(final))
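A hedged sketch of the intended output (the exact grouping depends on how the alignment falls out, but for a simple pair the mismatched prefix should come back as one ruby unit):

merged = merge_japanese_texts('漢字とかな', 'かんじとかな', html=True)
# expected to be something like:
# '<ruby><rb>漢字</rb><rt>かんじ</rt></ruby>とかな'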
844  pokedex/extract/oras.py  Normal file
@@ -0,0 +1,844 @@
"""Dumps data from Omega Ruby and Alpha Sapphire.

Filesystem reference: http://www.projectpokemon.org/wiki/ORAS_File_System
"""
import argparse
from collections import OrderedDict
from contextlib import contextmanager
import itertools
import math
from pathlib import Path
import shutil
import struct

from construct import Array, BitField, Bitwise, Magic, OptionalGreedyRange, Padding, Pointer, Struct, SLInt8, SLInt16, ULInt8, ULInt16, ULInt32
import png
import yaml

from .lib.garc import GARCFile, decrypt_xy_text
from .lib.text import merge_japanese_texts


# TODO fix some hardcoding in here
# TODO finish converting garc parsing to use construct, if possible, i think (i would not miss substream)
# way way more sprite work in here...


CANON_LANGUAGES = ('ja', 'en', 'fr', 'it', 'de', 'es', 'ko')
ORAS_SCRIPT_FILES = {
    'ja-kana': 'rom/a/0/7/1',
    'ja-kanji': 'rom/a/0/7/2',
    'en': 'rom/a/0/7/3',
    'fr': 'rom/a/0/7/4',
    'it': 'rom/a/0/7/5',
    'de': 'rom/a/0/7/6',
    'es': 'rom/a/0/7/7',
    'ko': 'rom/a/0/7/8',
}
ORAS_SCRIPT_ENTRIES = {
    'form-names': 5,
    # TODO these might be backwards, i'm just guessing
    'species-flavor-alpha-sapphire': 6,
    'species-flavor-omega-ruby': 7,
    'move-contest-flavor': 13,
    'move-names': 14,
    # Note: table 15 is also a list of move names, but with a few at the end
    # missing? XY leftovers?
    'move-flavor': 16,
    'type-names': 18,
    'ability-flavor': 36,
    'ability-names': 37,
    'nature-names': 51,
    'species-names': 98,
}
# The first element in each list is the name of the BASE form -- if it's not
# None, the base form will be saved under two filenames
ORAS_EXTRA_SPRITE_NAMES = {
    # Cosplay Pikachu
    25: (None, 'rockstar', 'belle', 'popstar', 'phd', 'libre', 'cosplay'),
    # Unown
    201: tuple('abcdefghijklmnopqrstuvwxyz') + ('exclamation', 'question'),
    # Castform
    351: (None, 'sunny', 'rainy', 'snowy'),
    # Kyogre and Groudon
    382: (None, 'primal',),
    383: (None, 'primal',),
    # Deoxys
    386: ('normal', 'attack', 'defense', 'speed'),
    # Burmy and Wormadam
    412: ('plant', 'sandy', 'trash'),
    413: ('plant', 'sandy', 'trash'),
    # Cherrim
    421: ('overcast', 'sunshine',),
    # Shellos and Gastrodon
    422: ('west', 'east',),
    423: ('west', 'east',),
    # Rotom
    479: (None, 'heat', 'wash', 'frost', 'fan', 'mow'),
    # Giratina
    487: ('altered', 'origin',),
    # Shaymin
    492: ('land', 'sky',),
    # Arceus
    493: (
        'normal', 'fighting', 'flying', 'poison', 'ground', 'rock', 'bug',
        'ghost', 'steel', 'fire', 'water', 'grass', 'electric', 'psychic',
        'ice', 'dragon', 'dark', 'fairy',
    ),
    # Basculin
    550: ('red-striped', 'blue-striped',),
    # Darmanitan
    555: ('standard', 'zen',),
    # Deerling and Sawsbuck
    585: ('spring', 'summer', 'autumn', 'winter'),
    586: ('spring', 'summer', 'autumn', 'winter'),
    # Tornadus, Thundurus, and Landorus
    641: ('incarnate', 'therian'),
    642: ('incarnate', 'therian'),
    645: ('incarnate', 'therian'),
    # Kyurem
    646: (None, 'white', 'black'),
    # Keldeo
    647: ('ordinary', 'resolute'),
    # Meloetta
    648: ('aria', 'pirouette'),
    # Genesect
    649: (None, 'douse', 'shock', 'burn', 'chill'),
    # Vivillon
    666: (
        'icy-snow', 'polar', 'tundra', 'continental', 'garden', 'elegant',
        'meadow', 'modern', 'marine', 'archipelago', 'high-plains',
        'sandstorm', 'river', 'monsoon', 'savanna', 'sun', 'ocean', 'jungle',
        'fancy', 'poke-ball',
    ),
    # Flabébé/Floette/Florges
    669: ('red', 'yellow', 'orange', 'blue', 'white'),
    670: ('red', 'yellow', 'orange', 'blue', 'white', 'eternal'),
    671: ('red', 'yellow', 'orange', 'blue', 'white'),
    # Furfrou
    676: (
        'natural', 'heart', 'star', 'diamond', 'debutante', 'matron', 'dandy',
        'la-reine', 'kabuki', 'pharaoh',
    ),
    # Meowstic
    #678: [male, female]
    # Aegislash
    681: ('shield', 'blade'),
    # Pumpkaboo/Gourgeist
    710: ('average', 'small', 'large', 'super'),
    711: ('average', 'small', 'large', 'super'),
    # Xerneas
    716: ('neutral', 'active'),
    # Hoopa
    720: ('confined', 'unbound'),
}


pokemon_struct = Struct(
    'pokemon',
    ULInt8('stat_hp'),
    ULInt8('stat_atk'),
    ULInt8('stat_def'),
    ULInt8('stat_speed'),
    ULInt8('stat_spatk'),
    ULInt8('stat_spdef'),
    ULInt8('type1'),
    ULInt8('type2'),
    ULInt8('catch_rate'),
    ULInt8('stage'),
    ULInt16('effort'),
    ULInt16('held_item1'),
    ULInt16('held_item2'),
    ULInt16('held_item3'),  # dark grass from bw, unused in oras?
    ULInt8('gender_rate'),
    ULInt8('steps_to_hatch'),
    ULInt8('base_happiness'),
    ULInt8('exp_curve'),
    ULInt8('egg_group1'),
    ULInt8('egg_group2'),
    ULInt8('ability1'),
    ULInt8('ability2'),
    ULInt8('ability_dream'),
    ULInt8('safari_escape'),
    ULInt16('form_species_start'),
    ULInt16('form_sprite_start'),
    ULInt8('form_count'),
    ULInt8('color'),
    ULInt16('base_exp'),
    ULInt16('height'),
    ULInt16('weight'),
    Bitwise(
        BitField('machines', 14 * 8, swapped=True),
    ),
    Padding(2),
    ULInt32('tutors'),
    ULInt16('mystery1'),
    ULInt16('mystery2'),
    ULInt32('bp_tutors1'),
    ULInt32('bp_tutors2'),
    ULInt32('bp_tutors3'),
    ULInt32('bp_tutors4'),
)

pokemon_mega_evolutions_struct = Array(
    3,
    Struct(
        'pokemon_mega_evolutions',
        ULInt16('number'),
        ULInt16('mode'),
        ULInt16('mega_stone_itemid'),
        Padding(2),
    )
)

egg_moves_struct = Struct(
    'egg_moves',
    ULInt16('count'),
    Array(
        lambda ctx: ctx.count,
        ULInt16('moveids'),
    ),
)

level_up_moves_struct = OptionalGreedyRange(
    Struct(
        'level_up_pair',
        SLInt16('moveid'),
        SLInt16('level'),
    ),
)

move_struct = Struct(
    'move',
    ULInt8('type'),
    ULInt8('category'),
    ULInt8('damage_class'),
    ULInt8('power'),
    ULInt8('accuracy'),
    ULInt8('pp'),
    SLInt8('priority'),
    ULInt8('min_max_hits'),
    SLInt16('caused_effect'),
    ULInt8('effect_chance'),
    ULInt8('status'),
    ULInt8('min_turns'),
    ULInt8('max_turns'),
    ULInt8('crit_rate'),
    ULInt8('flinch_chance'),
    ULInt16('effect'),
    SLInt8('recoil'),
    ULInt8('healing'),
    ULInt8('range'),  # ok
    Bitwise(
        BitField('stat_change', 24),
    ),
    Bitwise(
        BitField('stat_amount', 24),
    ),
    Bitwise(
        BitField('stat_chance', 24),
    ),
    ULInt8('padding0'),  # ok
    ULInt8('padding1'),  # ok
    ULInt16('flags'),
    ULInt8('padding2'),  # ok
    ULInt8('extra'),
)
move_container_struct = Struct(
    'move_container',
    Magic(b'WD'),  # waza... descriptions?
    ULInt16('record_ct'),
    Array(
        lambda ctx: ctx.record_ct,
        Struct(
            'records',
            ULInt32('offset'),
            Pointer(lambda ctx: ctx.offset, move_struct),
        ),
    ),
)

pokemon_sprite_struct = Struct(
    'pokemon_sprite_config',
    ULInt16('index'),
    ULInt16('female_index'),
    ULInt32('form_index_offset'),
    ULInt32('right_index_offset'),
    ULInt16('form_count'),
    ULInt16('right_count'),
)

# There are 63 tutor move bits in use, but only 60 move tutors -- the moves
# appear to be largely inherited from B2W2 but these are just not exposed in
# ORAS
ORAS_UNUSED_MOVE_TUTORS = {'dark-pulse', 'roost', 'sleep-talk'}
# Unsure where this is in the binary
ORAS_NORMAL_MOVE_TUTORS = (
    'grass-pledge',
    'fire-pledge',
    'water-pledge',
    'frenzy-plant',
    'blast-burn',
    'hydro-cannon',
    'draco-meteor',
    'dragon-ascent',
)


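These declarations use the legacy construct API; a quick self-contained check of the egg-move layout, for instance:

blob = struct.pack('<3H', 2, 33, 45)  # count=2, then two move ids
container = egg_moves_struct.parse(blob)
assert container.count == 2
assert list(container.moveids) == [33, 45]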
@contextmanager
def read_garc(path):
    with path.open('rb') as f:
        yield GARCFile(f)


# XXX christ lol. taken from SO. fodder for camel maybe
def represent_ordereddict(dumper, data):
    value = []

    for item_key, item_value in data.items():
        node_key = dumper.represent_data(item_key)
        node_value = dumper.represent_data(item_value)

        value.append((node_key, node_value))

    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', value)
yaml.add_representer(OrderedDict, represent_ordereddict)


def represent_tuple(dumper, data):
    return yaml.nodes.SequenceNode(
        u'tag:yaml.org,2002:seq',
        [dumper.represent_data(item) for item in data],
        flow_style=True,
    )
yaml.add_representer(tuple, represent_tuple)


def dump_to_yaml(data, f):
    # TODO gonna need a better way to handle flow style
    yaml.dump(
        data, f,
        default_flow_style=False,
        allow_unicode=True,
    )


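With both representers registered, OrderedDicts come out as ordinary block-style maps and tuples come out flow-style, e.g.:

import sys
dump_to_yaml(OrderedDict([('moves', ('tackle', 'growl'))]), sys.stdout)
# moves: [tackle, growl]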
def extract_data(root, out):
    # TODO big conceptual question for the yaml thing: how do we decide how the
    # identifiers work in the per-version data?  the "global" identifiers are
    # in theory based on the names from the latest version, and the game dump
    # scripts shouldn't have to care about what the latest version is
    # 1. make the canon data not be keyed by identifier (makes it hard to
    # follow what's going on in flavor text files etc, and unclear how to match
    # up items across versions)
    # 2. make each version's data keyed by its own identifiers (makes it hard
    # to align them all when loading everything, and unclear how to match up
    # items whose names change across versions)
    # 3. hardcode a mapping of version+identifier pairs to their current
    # identifiers, when they changed, which is a little ugly but also solves
    # all the match-up problems and is what we'd basically have to do anyway

    # -------------------------------------------------------------------------
    # Names and flavor text
    texts = {}
    for lang, fn in ORAS_SCRIPT_FILES.items():
        texts[lang] = {}
        with read_garc(root / fn) as garc:
            for entryname, entryid in ORAS_SCRIPT_ENTRIES.items():
                entry = garc[entryid][0]
                texts[lang][entryname] = decrypt_xy_text(entry.read())

    # Japanese text is special!  It's written in both kanji and kana, and we
    # want to combine them
    texts['ja'] = {}
    for entryname in ORAS_SCRIPT_ENTRIES:
        kanjis = texts['ja-kanji'][entryname]
        kanas = texts['ja-kana'][entryname]
        # But not if they're names of things.
        # (TODO this might not be true in the case of, say, towns?  in which
        # case, what do we do?  we want to ultimately put these in urls and
        # whatnot, right, but we don't want furigana there :S  do we need a
        # separate "identifier" field /per language/?)
        if entryname.endswith('names'):
            assert kanjis == kanas
            texts['ja'][entryname] = kanjis
        else:
            assert len(kanas) == len(kanjis)
            texts['ja'][entryname] = [
                merge_japanese_texts(kanji, kana)
                for (kanji, kana) in zip(kanjis, kanas)
            ]
    del texts['ja-kanji']
    del texts['ja-kana']

    identifiers = {}
    identifiers['species'] = [
        # TODO better identifier creation, to be determined later, but surely
        # want to lose . and '
        # TODO handling forms here is awkward since the form names are
        # contained in the personal struct
        ((species_name or '') + '-' + form_name).lower().replace(' ', '-')
        for (species_name, form_name) in itertools.zip_longest(
            texts['en']['species-names'],
            texts['en']['form-names'],
        )
    ]
    identifiers['move'] = [
        # TODO better identifier creation, to be determined later, but surely
        # want to lose . and '
        name.lower().replace(' ', '-')
        for name in texts['en']['move-names']
    ]

    textdir = out / 'script'
    if not textdir.exists():
        textdir.mkdir()
    for lang in CANON_LANGUAGES:
        with (textdir / (lang + '.yaml')).open('w') as f:
            # TODO this should use identifiers, not be lists
            # TODO need to skip slot 0 which is junk
            dump_to_yaml(texts[lang], f)

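The naive munging those TODOs complain about keeps punctuation, which is exactly the problem; illustrative:

assert "Mr. Mime".lower().replace(' ', '-') == 'mr.-mime'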
    # -------------------------------------------------------------------------
    # Scrape some useful bits from the binary
    with (root / 'exe/code.bin').open('rb') as f:
        # Tutored moves
        tutor_moves = dict(tutors=ORAS_NORMAL_MOVE_TUTORS)
        f.seek(0x004960f8)
        for n in range(1, 5):
            key = "bp_tutors{}".format(n)
            moves = tutor_moves[key] = []
            while True:
                moveid, = struct.unpack('<H', f.read(2))
                if moveid >= len(identifiers['move']):
                    break
                moves.append(identifiers['move'][moveid])

        # TMs
        machines = []
        f.seek(0x004a67ee)
        machineids = struct.unpack('<107H', f.read(2 * 107))
        # Order appears to be based on some gen 4 legacy: TMs 1 through 92, HMs
        # 1 through 6, then the other eight TMs and the last HM.  But the bits
        # in the Pokémon structs are in the expected order of 1 through 100, 1
        # through 7
        machines = [
            identifiers['move'][moveid]
            for moveid in
                machineids[0:92] +
                machineids[98:106] +
                machineids[92:98] +
                machineids[106:]
        ]


    # -------------------------------------------------------------------------
    # Pokémon structs
    pokemon_data = []
    with read_garc(root / 'rom/a/1/9/5') as garc:
        personals = [subfile[0].read() for subfile in garc]
    _pokemon_forms = {}  # "real" species id => (base species id, form name id)
    _next_name_form_id = 723
    for i, personal in enumerate(personals[:-1]):
        record = pokemon_struct.parse(personal)
        # TODO transform to an OD somehow probably
        pokemon_data.append(record)
        #print("{:3d} {:15s} {} {:5d} {:5d}".format(
        #    i,
        #    identifiers['species'][baseid],
        #    ('0'*16 + bin(record.mystery1)[2:])[-16:],
        #    record.mystery2,
        #    record.stage,
        #))
        # TODO some pokemon have sprite starts but no species start, because their sprites vary obv
        if record.form_count > 1:
            # The form names appear to be all just jammed at the end in order,
            # completely unrelated to either of the "start" offsets here
            for offset in range(record.form_count - 1):
                #form_name = texts['en']['form-names'][_next_name_form_id]

                if record.form_species_start:
                    # TODO still no idea how "intangible" forms are being
                    # handled in the new schema
                    _pokemon_forms[record.form_species_start + offset] = i, _next_name_form_id

                _next_name_form_id += 1

        if record.form_species_start:
            for offset in range(record.form_count - 1):
                # TODO grab the form names argh
                identifiers['species'][record.form_species_start + offset] = identifiers['species'][i]

    #for i in range(723, 825 + 1):
    #    base_species_id, form_name_id = _pokemon_forms[i]
    #    species_name = texts['en']['species-names'][base_species_id]
    #    form_name = texts['en']['form-names'][form_name_id]
    #    print(i, species_name, '/', form_name)

    # -------------------------------------------------------------------------
    # Move stats
    movesets = OrderedDict()
    with read_garc(root / 'rom/a/1/8/9') as garc:
        # Only one subfile
        data = garc[0][0].read()
        container = move_container_struct.parse(data)
        for n, record in enumerate(container.records):
            m = record.move
            # TODO with the release of oras all moves have contest types and effects again!  where are they??
            #print("{:3d} {:20s} | {m.type:3d} {m.power:3d} {m.pp:2d} {m.accuracy:3d} / {m.priority:2d} {m.range:2d} {m.damage_class:1d} / {m.effect:3d} {m.caused_effect:3d} {m.effect_chance:3d} -- {m.status:3d} {m.min_turns:3d} {m.max_turns:3d} {m.crit_rate:3d} {m.flinch_chance:3d} {m.recoil:4d} {m.healing:3d} / {m.stat_change:06x} {m.stat_amount:06x} {m.stat_chance:06x} / {m.padding0:3d} {m.padding1:3d} {m.flags:04x} {m.padding2:3d} {m.extra:3d}".format(
            #    n,
            #    identifiers['move'][n],
            #    m=record.move,
            #))

    # Egg moves
    with read_garc(root / 'rom/a/1/9/0') as garc:
        for i, subfile in enumerate(garc):
            ident = identifiers['species'][i]
            data = subfile[0].read()
            if not data:
                continue
            container = egg_moves_struct.parse(data)
            moveset = movesets.setdefault(ident, OrderedDict())
            eggset = moveset['egg'] = []
            for moveid in container.moveids:
                eggset.append(identifiers['move'][moveid])

    # Level-up moves
    with read_garc(root / 'rom/a/1/9/1') as garc:
        for i, subfile in enumerate(garc):
            ident = identifiers['species'][i]
            level_up_moves = subfile[0].read()
            moveset = movesets.setdefault(ident, OrderedDict())
            levelset = moveset['level'] = []
            lastlevel = None
            order = 1
            for pair in level_up_moves_struct.parse(level_up_moves):
                # End is indicated with -1, -1
                if pair.moveid <= 0:
                    break
                levelset.append((
                    pair.level,
                    identifiers['move'][pair.moveid],
                ))

                if pair.level == lastlevel:
                    order += 1
                else:
                    lastlevel = pair.level
                    order = 1

    # Evolution
    #with read_garc(root / 'rom/a/1/9/2') as garc:
    #    for subfile in garc:
    #        evolution = subfile[0].read()
    #        print(repr(evolution))
    # Mega evolution
    #with read_garc(root / 'rom/a/1/9/3') as garc:
    #    for subfile in garc:
    #        evolution = subfile[0].read()
    #        print(repr(evolution))
    # TODO what is a/1/9/4?  8 files of 404 bytes each
    # Baby Pokémon
    #with read_garc(root / 'rom/a/1/9/6') as garc:
    #    for subfile in garc:
    #        baby_pokemon = subfile[0].read()
    #        print(repr(baby_pokemon))
    # Item stats
    #with read_garc(root / 'rom/a/1/9/7') as garc:
    #    for subfile in garc:
    #        item_stats = subfile[0].read()
    #        print(repr(item_stats))

    # Tutor moves (from the personal structs)
    for i, datum in enumerate(pokemon_data):
        ident = identifiers['species'][i]
        moveset = movesets.setdefault(ident, OrderedDict())
        tutorset = moveset['tutor'] = []
        for key, tutors in tutor_moves.items():
            for bit, moveident in enumerate(tutors):
                if moveident in ORAS_UNUSED_MOVE_TUTORS:
                    continue
                if not datum[key] & (1 << bit):
                    continue
                tutorset.append(moveident)

        # TMs
        machineset = moveset['machine'] = []
        for bit, moveident in enumerate(machines):
            if not datum['machines'] & (1 << bit):
                continue
            machineset.append(moveident)

    with (out / 'movesets.yaml').open('w') as f:
        dump_to_yaml(movesets, f)


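The bitfield tests above follow the usual pattern: bit N set means the Nth move in the corresponding list is learnable. In isolation:

tutors = ('grass-pledge', 'fire-pledge', 'water-pledge')
bitfield = 0b101
learned = [m for bit, m in enumerate(tutors) if bitfield & (1 << bit)]
assert learned == ['grass-pledge', 'water-pledge']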
def extract_box_sprites(root, out):
    filenames = {}
    with (root / 'exe/code.bin').open('rb') as f:
        # Form configuration, used to put sprites in the right order
        # NOTE: in x/y the address is 0x0043ea98
        f.seek(0x0047d650)
        # TODO need to do a different thing for main sprites
        # TODO magic number
        for n in range(722):
            sprite = pokemon_sprite_struct.parse_stream(f)
            assert sprite.index not in filenames
            filenames[sprite.index] = "{}".format(n)
            if sprite.female_index != sprite.index:
                assert sprite.female_index not in filenames
                filenames[sprite.female_index] = "{}-female".format(n)
            # Note that these addresses are relative to RAM, and the binary is
            # loaded into RAM starting at 0x100000, so we need to subtract that
            # to get a file position
            pos = f.tell()
            form_indices = ()
            right_indices = ()

            if sprite.form_index_offset:
                f.seek(sprite.form_index_offset - 0x100000)
                form_indices = struct.unpack(
                    "<{}H".format(sprite.form_count),
                    f.read(2 * sprite.form_count),
                )
                for form, form_idx in enumerate(form_indices):
                    # Ignore the first form, since it's the default and thus
                    # covered by `index` already
                    if form == 0:
                        continue
                    if form_idx == sprite.index:
                        continue
                    assert form_idx not in filenames
                    filenames[form_idx] = "{}-form{}".format(n, form)

            if sprite.right_index_offset:
                f.seek(sprite.right_index_offset - 0x100000)
                right_indices = struct.unpack(
                    "<{}H".format(sprite.right_count),
                    f.read(2 * sprite.right_count),
                )
                if sprite.form_count:
                    assert sprite.right_count == sprite.form_count
                    for form, (form_idx, right_idx) in enumerate(zip(form_indices, right_indices)):
                        if form_idx == right_idx:
                            continue
                        if form != 0:
                            suffix = "form{}-right".format(form)
                        else:
                            suffix = 'right'
                        assert right_idx not in filenames
                        filenames[right_idx] = "{}-{}".format(n, suffix)
                else:
                    assert sprite.right_count == 2
                    assert right_indices[0] == right_indices[1]
                    if right_indices[0] != sprite.index:
                        assert right_indices[0] not in filenames
                        filenames[right_indices[0]] = "{}-right".format(n)

            f.seek(pos)

    pokemon_sprites_dir = out
    if not pokemon_sprites_dir.exists():
        pokemon_sprites_dir.mkdir()
    with read_garc(root / 'rom/a/0/9/1') as garc:
        from .lib.clim import decode_clim
        for i, subfile in enumerate(garc):
            if i == 0:
                # Dummy blank sprite, not interesting to us
                continue
            elif i in filenames:
                filename = filenames[i] + '.png'
            elif i == len(garc) - 1:
                # Very last one is egg
                filename = 'egg.png'
            else:
                # This is a duplicate Entei sprite that's not used
                assert i in (333,)
                continue

            data = subfile[0].read()
            width, height, color_depth, pixels = decode_clim(data)
            png_writer = png.Writer(
                width=width,
                height=height,
                alpha=True,
            )

            # this library is so fucking stupid
            # TODO strictly speaking we could just write out a paletted PNG directly
            # TODO add sBIT chunk indicating original bit depth
            with (pokemon_sprites_dir / filename).open('wb') as f:
                png_writer.write(f, (itertools.chain(*row) for row in pixels))


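The RAM-address arithmetic used above, spelled out (0x100000 is the load address of code.bin, per the comment):

def ram_to_file_offset(pointer, load_address=0x100000):
    return pointer - load_address

assert ram_to_file_offset(0x57d650) == 0x47d650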
def extract_dex_sprites(root, out):
    # Some Pokémon have dex sprites for their forms, too, and they're all
    # clustered together, so we have to do a little work to fix the numbering.
    # Luckily the dex sprites are in the same order as the models
    # (unsurprising, as they're just model renders), which also tells us what
    # Pokémon have female forms.  The mega evolution map tells us which forms
    # are megas, and the rest are listed manually above as
    # ORAS_EXTRA_SPRITE_NAMES.

    # Grab the list of megas first
    num_megas = {}  # pokemonid => number of mega evos
    with read_garc(root / 'rom/a/1/9/3') as garc:
        for pokemonid, subfile in enumerate(garc):
            mega_evos = pokemon_mega_evolutions_struct.parse_stream(subfile[0])
            num_megas[pokemonid] = max(
                mega_evo.number for mega_evo in mega_evos)

    # Then construct filenames, using num_megas plus information from the model
    # index
    filenames = {}  # model/sprite number => filename, sans extension
    duplicate_filenames = []  # pairs of (copy from, copy to)
    with read_garc(root / 'rom/a/0/0/8') as garc:
        f = garc[0][0]
        # TODO magic number
        for n in range(721):
            # Unlike /virtually everywhere else/, Pokémon are zero-indexed here
            pokemonid = n + 1
            # Index of the first model (also zero-indexed), how many models the
            # Pokémon has, and some flags
            start, count, flags = struct.unpack('<HBB', f.read(4))
            model_num = start + 1
            # For some asinine reason, Xerneas is counted as two separate
            # Pokémon in the dex sprites but not the models, so we have to
            # shift everything after it back by 1
            if pokemonid == 716:
                count = 2
            elif pokemonid >= 717:
                model_num += 1

            filenames[model_num] = str(pokemonid)
            form_count = count - 1  # discount "base" form
            total_model_count = model_num + count - 1

            # Some "forms" have no real default, so we save the sprite both as
            # nnn.png and nnn-form.png, to guarantee that nnn.png always exists
            if pokemonid in ORAS_EXTRA_SPRITE_NAMES:
                if ORAS_EXTRA_SPRITE_NAMES[pokemonid][0] is not None:
                    duplicate_filenames.append((
                        str(pokemonid),
                        "{}-{}".format(
                            pokemonid, ORAS_EXTRA_SPRITE_NAMES[pokemonid][0]),
                    ))

            # Don't know what flag 1 is; everything has it.
            # Flag 2 means the first alternate form is a female variant.
            if flags & 2:
                assert form_count > 0
                form_count -= 1
                model_num += 1
                filenames[model_num] = "female/{}".format(pokemonid)
            # Flag 4 just means there are more forms?
            if flags & 4:
                assert form_count

            assert 1 or 1 == sum((
                form_count == 0,
                num_megas[pokemonid] > 0,
                pokemonid in ORAS_EXTRA_SPRITE_NAMES,
            ))
            if num_megas[pokemonid]:
                assert form_count == num_megas[pokemonid]
                assert pokemonid not in ORAS_EXTRA_SPRITE_NAMES
                model_num += 1
                if form_count == 1:
                    filenames[model_num] = "{}-mega".format(pokemonid)
                else:
                    # Charizard and Mewtwo
                    assert form_count == 2
                    filenames[model_num] = "{}-mega-x".format(pokemonid)
                    filenames[model_num + 1] = "{}-mega-y".format(pokemonid)
            elif pokemonid in ORAS_EXTRA_SPRITE_NAMES:
                for form_name in ORAS_EXTRA_SPRITE_NAMES[pokemonid][1:]:
                    model_num += 1
                    filenames[model_num] = "{}-{}".format(pokemonid, form_name)

    # And now, do the ripping
    # TODO This will save Unown A as 201.png, and not create a 201-a.png
    pokemon_sprites_dir = out
    with read_garc(root / 'rom/a/2/6/3') as garc:
        from .lib.clim import decode_clim
        for i, subfile in enumerate(garc):
            shiny_prefix = ''
            if i > total_model_count:
                i -= total_model_count
                shiny_prefix = 'shiny/'

            if i == 0:
                # Dummy blank sprite, not interesting to us
                continue
            elif 37 <= i <= 41:
                # Cosplay Pikachu's outfits -- the sprites are blank, so saving
                # these is not particularly useful
                continue
            elif i in filenames:
                filename = shiny_prefix + filenames[i] + '.png'
            else:
                raise ValueError(
                    "Can't find a filename for sprite number {}".format(i))

            data = subfile[0].read()
            width, height, color_depth, pixels = decode_clim(data)
            png_writer = png.Writer(
                width=width,
                height=height,
                alpha=True,
            )

            # this library is so fucking stupid
            # TODO strictly speaking we could just write out a paletted PNG directly
            # TODO add sBIT chunk indicating original bit depth
            path = pokemon_sprites_dir / filename
            parent = path.parent
            if not parent.exists():
                parent.mkdir(parents=False)

            with path.open('wb') as f:
                png_writer.write(f, (itertools.chain(*row) for row in pixels))

    for source, dest in duplicate_filenames:
        shutil.copyfile(
            str(pokemon_sprites_dir / source) + '.png',
            str(pokemon_sprites_dir / dest) + '.png',
        )


def _munge_source_arg(strpath):
    path = Path(strpath)
    if not path.is_dir():
        raise argparse.ArgumentTypeError(
            "{!r} is not a directory".format(strpath))

    # TODO something something romfs, exefs
    return path

def make_arg_parser():
    p = argparse.ArgumentParser()
    p.add_argument('what', choices=('data', 'dex-sprites', 'box-sprites'), help='what to extract')
    # TODO should verify that this is an actual game dump, and find the rom/exe
    p.add_argument('source', type=_munge_source_arg, help='path to an unpacked game image')
    p.add_argument('dest', type=_munge_source_arg, help='directory to dump the results into')

    return p


def main(args):
    parser = make_arg_parser()
    args = parser.parse_args(args)

    # TODO support 'all', and just make some subdirectories per thing
    # TODO or maybe merge all the sprite things together since stuff will need moving around anyway idk
    if args.what == 'data':
        extract_data(args.source, args.dest)
    elif args.what == 'dex-sprites':
        extract_dex_sprites(args.source, args.dest)
    elif args.what == 'box-sprites':
        extract_box_sprites(args.source, args.dest)


if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
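End-to-end, the module would be driven like this (both paths are hypothetical, and must be existing directories per _munge_source_arg):

$ python -m pokedex.extract.oras data path/to/dump path/to/out
$ python -m pokedex.extract.oras dex-sprites path/to/dump path/to/sprites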