From 053f2a8d22124e2931ddfcaedd48bc4ec6e5deb3 Mon Sep 17 00:00:00 2001
From: "Eevee (Lexy Munroe)" <eevee.git@veekun.com>
Date: Thu, 5 Jan 2017 04:57:05 -0800
Subject: [PATCH] Use YAML schema for gen 6/7; add gen7 form names; improved
 image support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Specifically:

- Add support for detecting FLIM format

- Add support for more color formats

- Add a small decoded image type that knows how to write itself out as
  a PNG

- Improve ETC1 decoder to work with images whose dimensions are not
  powers of two, images with no alpha channel, and images with the
  strange FLIM pixel order

- Port the gen 6/7 extractor to Construct 2.8

- Switch to using script tags in language names, to distinguish Japanese
  kana from kanji and Simplified from Traditional Chinese

- Drop the load-time merging of kanji and kana

- Add paths to various text files in SUMO

- Add form names for SUMO Pokémon

- Clean up identifiers a bit, especially the distinction between species
  and Pokémon

- Use the Pokémon schema type to dump what we have so far, and give it a
  couple more fields that didn't exist in gen 1

- Get movesets dumping correctly

- Special-case a bunch of weirdness, where the number of dex sprites
  doesn't match the number of models in SUMO
---
 pokedex/extract/lib/clim.py | 222 ++++++++-
 pokedex/extract/lib/etc1.py |  58 ++-
 pokedex/extract/lib/garc.py |  48 +-
 pokedex/extract/lib/pc.py   |   2 +-
 pokedex/extract/oras.py     | 870 ++++++++++++++++++++++--------------
 pokedex/schema.py           |  21 +
 6 files changed, 846 insertions(+), 375 deletions(-)

diff --git a/pokedex/extract/lib/clim.py b/pokedex/extract/lib/clim.py
index 3723b50..3329855 100644
--- a/pokedex/extract/lib/clim.py
+++ b/pokedex/extract/lib/clim.py
@@ -1,3 +1,5 @@
+import io
+import itertools
 import math
 import struct
 
@@ -17,7 +19,7 @@ imag_header_struct = c.Struct(
     'section_length' / c.Const(c.Int32ul, 0x10),
     'width' / c.Int16ul,
     'height' / c.Int16ul,
-    'format' / c.Int32ul,
+    #'format' / c.Int32ul,
     # TODO this seems to have been expanded into several things in SUMO
     #c.Enum(
     #    c.ULInt32('format'),
@@ -37,13 +39,35 @@ imag_header_struct = c.Struct(
     #    A4=13,
     #    #ETC1=19,
     #)
+    'unknown' / c.Int16ul,
+    'format' / c.Enum(
+        c.Int8ul,
+        L8=0,
+        A8=1,
+        LA4=2,
+        LA8=3,
+        HILO8=4,
+        RGB565=5,
+        RGB8=6,
+        RGBA5551=7,
+        RGBA4=8,
+        RGBA8=9,
+        ETC1=10,
+        ETC1A4=11,
+        L4=12,
+        A4=13,
+        #ETC1=19,
+    ),
+    # RGB565=5,
+    # ETC1A4=11,
+    'unknown2' / c.Int8ul,
 )
 
 
 # TODO probably move these to their own module, since they aren't just for
 # CLIM.  pixel deshuffler, too.  (which should probably spit out pypng's native
 # format)
-COLOR_DECODERS = {}
+COLOR_FORMATS = {}
 
 
 @attr.s
@@ -59,22 +83,68 @@ class ColorFormat:
 
     def __iter__(self):
         # TODO back compat until i fix the below code
-        return iter((self.decoder, self.bits_per_pixel, self.bit_depth))
+        return iter((self, self.bits_per_pixel, self.bit_depth))
 
 
 def _register_color_decoder(name, *, bpp, depth, alpha):
     def register(f):
-        COLOR_DECODERS[name] = ColorFormat(name, f, bpp, depth, alpha)
+        COLOR_FORMATS[name] = ColorFormat(name, f, bpp, depth, alpha)
         return f
     return register
 
 
+@_register_color_decoder('A4', bpp=0.5, depth=4, alpha=True)
+def decode_A4(data):
+    """Yield two black pixels per byte, using the low nybble's alpha first."""
+    for packed in data:
+        # Multiplying a nybble by 0x11 repeats it into both halves of a byte.
+        low_alpha = (packed & 0xf) * 0x11
+        high_alpha = (packed >> 4) * 0x11
+        yield 0, 0, 0, low_alpha
+        yield 0, 0, 0, high_alpha
+
+
+@_register_color_decoder('A8', bpp=1, depth=8, alpha=True)
+def decode_a8(data):
+    """Yield one black pixel per byte, with that byte as its alpha."""
+    yield from ((0, 0, 0, alpha) for alpha in data)
+
+
+@_register_color_decoder('L4', bpp=0.5, depth=4, alpha=False)
+def decode_l4(data):
+    """Yield two gray pixels per byte, taking the low nybble first."""
+    for packed in data:
+        # Multiplying a nybble by 0x11 repeats it into both halves of a byte.
+        first = (packed & 0xf) * 0x11
+        second = (packed >> 4) * 0x11
+        yield first, first, first
+        yield second, second, second
+
+
 @_register_color_decoder('L8', bpp=1, depth=8, alpha=False)
 def decode_l8(data):
     for l in data:
         yield l, l, l
 
 
+@_register_color_decoder('LA4', bpp=1, depth=4, alpha=True)
+def decode_la4(data):
+    """Yield (l, l, l, a) per byte: high nybble luminance, low nybble alpha."""
+    for la in data:
+        # Widen each 4-bit value to 8 bits by repeating the nybble (n * 0x11),
+        # so that 0xf becomes 0xff rather than 0xf0.
+        l = (la >> 4) * 0x11
+        a = (la & 0xf) * 0x11
+        yield l, l, l, a
+
+
+@_register_color_decoder('LA8', bpp=2, depth=8, alpha=True)
+def decode_la8(data):
+    """Yield (l, l, l, a) for each two-byte pair, stored alpha byte first."""
+    for offset in range(0, len(data), 2):
+        alpha, lum = data[offset], data[offset + 1]
+        yield lum, lum, lum, alpha
+
+
 @_register_color_decoder('RGBA4', bpp=2, depth=4, alpha=True)
 def decode_rgba4(data):
     # The idea is that every uint16 is a packed rrrrggggbbbbaaaa, but when
@@ -93,15 +163,16 @@ def decode_rgba4(data):
 @_register_color_decoder('RGB8', bpp=3, depth=8, alpha=False)
 def decode_rgb8(data):
     for i in range(0, len(data), 3):
-        yield data[i:i + 3]
+        yield data[i:i + 3][::-1]
 
 
 @_register_color_decoder('RGBA8', bpp=4, depth=8, alpha=True)
 def decode_rgba8(data):
     for i in range(0, len(data), 4):
-        yield data[i:i + 4]
+        yield data[i:i + 4][::-1]
 
 
+# FIXME turns out the above just are these, so, ditch these
 @_register_color_decoder('BGR8', bpp=3, depth=8, alpha=False)
 def decode_bgr8(data):
     for i in range(0, len(data), 3):
@@ -125,6 +196,7 @@ def decode_rgba5551(data, *, start=0, count=None):
 
     for i in range(start, end, 2):
         datum = data[i] + data[i + 1] * 256
+        # FIXME repeat rather than doing division
         r = (((datum >> 11) & 0x1f) * 255 + 15) // 31
         g = (((datum >> 6) & 0x1f) * 255 + 15) // 31
         b = (((datum >> 1) & 0x1f) * 255 + 15) // 31
@@ -132,6 +204,45 @@ def decode_rgba5551(data, *, start=0, count=None):
         yield r, g, b, a
 
 
+@_register_color_decoder('RGB565', bpp=2, depth=5, alpha=False)
+def decode_rgb565(data, *, start=0, count=None):
+    """Yield 8-bit (r, g, b) tuples from little-endian 16-bit RGB565 values.
+
+    Decodes ``count`` values from byte offset ``start`` (default: to the end).
+    """
+    # FIXME i bet construct totally /can/ parse this mess for me
+    stop = len(data) if count is None else start + count * 2
+    for pos in range(start, stop, 2):
+        word = data[pos] | (data[pos + 1] << 8)
+        # FIXME repeat rather than doing division
+        red = ((word >> 11) & 0x1f) * 255 + 15
+        green = ((word >> 5) & 0x3f) * 255 + 31
+        blue = (word & 0x1f) * 255 + 15
+        yield red // 31, green // 63, blue // 31
+
+
+@_register_color_decoder('RGB332', bpp=1, depth=2, alpha=False)
+def decode_rgb332(data, *, start=0, count=None):
+    """Yield 8-bit (r, g, b) tuples from packed rrrgggbb bytes, reading
+    ``count`` bytes from offset ``start`` (default: to the end of ``data``).
+    """
+    end = len(data) if count is None else start + count
+    for i in range(start, end):
+        datum = data[i]
+        # Widen 3 bits to 8 by repeating the pattern: 0 -> 0 and 7 -> 255.
+        r = (datum >> 5) & 0x7
+        r = (r << 5) | (r << 2) | (r >> 1)
+        g = (datum >> 2) & 0x7
+        g = (g << 5) | (g << 2) | (g >> 1)
+        # Blue is only the low two bits; a 3-bit mask would overlap green.
+        b = (datum & 0x3) * 0x55
+        yield r, g, b
+
+
+_register_color_decoder('ETC1', bpp=0.5, depth=4, alpha=False)(None)
+_register_color_decoder('ETC1A4', bpp=1, depth=4, alpha=True)(None)
+
+
 del _register_color_decoder
 
 
@@ -156,12 +267,24 @@ def untile_pixels(raw_pixels, width, height, *, is_flim):
     Taken from: https://github.com/Zhorken/pokemon-x-y-icons/
     """
 
+    # FIXME this is a wild guess, because i've seen a 4x4 image that this just
+    # doesn't handle correctly, but the image is all white so i have no idea
+    # what the right fix is -- there's a 4 x 0x78 in 0/7/9 though...
+    if width < 8 or height < 8:
+        pixels = []
+        it = iter(raw_pixels)
+        for r in range(height):
+            pixels.append([])
+            for c in range(width):
+                pixels[-1].append(next(it))
+        return pixels
+
     # Images are stored padded to powers of two
     stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
     stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
     num_pixels = stored_width * stored_height
-    tile_width = stored_width // 8
-    tile_height = stored_height // 8
+    tile_width = (stored_width + 7) // 8
+    tile_height = (stored_height + 7) // 8
 
     pixels = [
         [None for x in range(width)]
@@ -175,6 +298,7 @@ def untile_pixels(raw_pixels, width, height, *, is_flim):
         # Find the coordinates of the top-left corner of the current tile.
         # n.b. The image is eight tiles wide, and each tile is 8×8 pixels.
         tile_num = n // 64
+        # FIXME i found a 4x4 FLIM that this fails for???
         if is_flim:
             # The FLIM format seems to pseudo-rotate the entire image to the
             # right, so tiles start in the bottom left and go up
@@ -224,25 +348,38 @@ def decode_clim(data):
         raise ValueError("Unknown image format {}".format(file_format))
 
     imag_header = imag_header_struct.parse(data[-20:])
-    if is_flim:
-        # TODO SUMO hack; not sure how to get format out of this header
-        imag_header.format = 'RGBA5551'
+    #if is_flim:
+    #    # TODO SUMO hack; not sure how to get format out of this header
+    #    imag_header.format = 'RGBA5551'
 
-    if imag_header.format not in COLOR_DECODERS:
+    if imag_header.format not in COLOR_FORMATS:
         raise ValueError(
             "don't know how to decode {} pixels".format(imag_header.format))
-    color_decoder, color_bpp, color_depth = COLOR_DECODERS[imag_header.format]
+    color_format = COLOR_FORMATS[imag_header.format]
 
     mode, = struct.unpack_from('<H', data, 0)
     if mode == 2:
         # Paletted
         palette_length, = struct.unpack_from('<H', data, 2)
-        palette = list(color_decoder(data, start=4, count=palette_length))
-        data_start = 4 + palette_length * color_bpp
+        palette = list(color_format.decoder(data, start=4, count=palette_length))
+        data_start = 4 + palette_length * color_format.bits_per_pixel
         scrambled_pixels = uncuddle_paletted_pixels(palette, data[data_start:])
+    elif imag_header.format == 'ETC1':
+        # FIXME merge this decoder in (problem is it needs to know width +
+        # height -- maybe i can move the pixel unscrambling out of it somehow?)
+        from .etc1 import decode_etc1
+        pixels = decode_etc1(b'\x00' * 0x80 + data, imag_header.width, imag_header.height, use_alpha=False, is_flim=True)[4]
+        return DecodedImageData(
+            imag_header.width, imag_header.height, color_format, None, pixels)
+    elif imag_header.format == 'ETC1A4':
+        # FIXME same
+        from .etc1 import decode_etc1
+        pixels = decode_etc1(b'\x00' * 0x80 + data, imag_header.width, imag_header.height, is_flim=True)[4]
+        return DecodedImageData(
+            imag_header.width, imag_header.height, color_format, None, pixels)
     else:
         palette = None
-        scrambled_pixels = color_decoder(data)
+        scrambled_pixels = color_format.decoder(data)
 
     pixels = untile_pixels(
         scrambled_pixels,
@@ -250,4 +387,55 @@ def decode_clim(data):
         imag_header.height,
         is_flim=is_flim,
     )
-    return imag_header.width, imag_header.height, color_depth, palette, pixels
+    return DecodedImageData(
+        imag_header.width, imag_header.height, color_format, palette, pixels)
+
+
+class DecodedImageData:
+    """Decoded image: dimensions, color format, optional palette, pixel rows."""
+
+    def __init__(self, width, height, color_format, palette, pixels):
+        self.width = width
+        self.height = height
+        self.color_format = color_format
+        self.palette = palette
+        self.pixels = pixels
+
+    def __iter__(self):
+        return iter((self.width, self.height, self.color_format.bit_depth, self.palette, self.pixels))
+
+    def __getitem__(self, index):
+        # decode_clim still indexes decode_etc1's result with [4] for pixels.
+        return tuple(self)[index]
+
+    def mirror(self):
+        """Reverse every pixel row in place."""
+        for row in self.pixels:
+            row.reverse()
+
+    def write_to_png(self, f):
+        """Write this image to the file object ``f`` as a PNG."""
+        import png
+
+        writer_kwargs = dict(width=self.width, height=self.height)
+        if self.palette:
+            writer_kwargs['palette'] = self.palette
+        if self.color_format.alpha:
+            # TODO do i really only need alpha=True if there's no palette?
+            writer_kwargs['alpha'] = True
+        writer = png.Writer(**writer_kwargs)
+
+        if not self.palette:
+            # Flatten each row of pixel tuples into one run of channel values.
+            writer.write(f, (itertools.chain(*row) for row in self.pixels))
+            return
+
+        # Paletted: keep Zhorken's idea of recording the original bit depth in
+        # an sBIT chunk.  PyPNG can't do that directly, so render to memory
+        # and splice the chunk in as the second chunk before writing out.
+        # FIXME should probably just do that for everything?
+        buf = io.BytesIO()
+        writer.write(buf, self.pixels)
+        buf.seek(0)
+        chunks = list(png.Reader(buf).chunks())
+        chunks.insert(1, ('sBIT', bytes([self.color_format.bit_depth] * 3)))
+        png.write_chunks(f, chunks)
diff --git a/pokedex/extract/lib/etc1.py b/pokedex/extract/lib/etc1.py
index b41913f..31a7efb 100644
--- a/pokedex/extract/lib/etc1.py
+++ b/pokedex/extract/lib/etc1.py
@@ -8,6 +8,7 @@ that decodes four 4x4 blocks one 8x8 block at a time, because of course it is.
 (I believe the 3DS operates with 8x8 tiles, so this does make some sense.)
 """
 import io
+import math
 
 # Easier than doing math
 THREE_BIT_TWOS_COMPLEMENT = [0, 1, 2, 3, -4, -3, -2, -1]
@@ -40,28 +41,34 @@ def clamp_to_byte(n):
     return max(0, min(255, n))
 
 
-def decode_etc1(data):
-    # TODO sizes are hardcoded here
-    width = 128
-    height = 128
-
+# FIXME sizes are hardcoded here
+def decode_etc1(data, width=128, height=128, use_alpha=True, is_flim=False):
     # TODO this seems a little redundant; could just ask for a stream
     f = io.BytesIO(data)
     # Skip header
     f.read(0x80)
 
-    outpixels = [[None] * width for _ in range(height)]
+    # Images are stored padded to powers of two
+    stored_width = 2 ** math.ceil(math.log(width) / math.log(2))
+    stored_height = 2 ** math.ceil(math.log(height) / math.log(2))
+
+    outpixels = [[None] * (width) for _ in range(height)]
     # ETC1 encodes as 4x4 blocks.  Normal ETC1 arranges them in English reading
     # order, right and down.  This Nintendo variant groups them as 8x8
     # superblocks, where the four blocks in each superblock are themselves
     # arranged right and down.  So we read block offsets 8 at a time, and 'z'
     # is our current position within a superblock.
     # TODO this may do the wrong thing if width/height is not divisible by 8
-    for blocky in range(0, height, 8):
-        for blockx in range(0, width, 8):
+    for blocky in range(0, stored_height, 8):
+        for blockx in range(0, stored_width, 8):
             for z in range(4):
-                row = f.read(16)
-                if not row:
+                if use_alpha:
+                    row = f.read(16)
+                else:
+                    # FIXME this could sure be incorporated better
+                    row = b'\xff' * 8 + f.read(8)
+                if len(row) < 16:
+                    print(row, blocky, blockx, z, f.tell() - 0x80, len(data) - 0x80)
                     raise EOFError
 
                 # Each block is encoded as 16 bytes.  The first 8 are a 4-bit
@@ -126,16 +133,28 @@ def decode_etc1(data):
                 base1 = red1, green1, blue1
                 base2 = red2, green2, blue2
 
+                # FLIM images do this truly bizarre thing where they write out the columns, as rows
+                if is_flim:
+                    block = (blocky // 8) * (stored_width // 8) + (blockx // 8)
+                    x0 = block // (stored_height // 8) * 8 + z // 2 * 4
+                    y0 = block % (stored_height // 8) * 8 + z % 2 * 4
+                else:
+                    x0 = blockx + z % 2 * 4
+                    y0 = blocky + z // 2 * 4
+
                 # Now deal with individual pixels
                 it = iter_alpha_nybbles(alpha)
                 for c in range(4):
                     for r in range(4):
-                        x = blockx + c
-                        y = blocky + r
-                        if z in (1, 3):
-                            x += 4
-                        if z in (2, 3):
-                            y += 4
+                        if is_flim:
+                            x = x0 + r
+                            y = y0 + c
+                        else:
+                            x = x0 + c
+                            y = y0 + r
+
+                        if not (x < width and y < height):
+                            continue
 
                         if (flipbit and r < 2) or (not flipbit and c < 2):
                             table = table1
@@ -149,8 +168,11 @@ def decode_etc1(data):
                         lobit = (lopixelbits >> pixelbit) & 0x1
                         mod = table[hibit * 2 + lobit]
                         color = tuple(clamp_to_byte(b + mod) for b in base)
-                        color += (next(it),)
+                        if use_alpha:
+                            color += (next(it),)
                         outpixels[y][x] = color
 
     # 4 is the bit depth; None is the palette
-    return width, height, 4, None, outpixels
+    from .clim import DecodedImageData, COLOR_FORMATS
+    # FIXME stupid import, wrong color format
+    return DecodedImageData(width, height, COLOR_FORMATS['ETC1A4'], None, outpixels)
diff --git a/pokedex/extract/lib/garc.py b/pokedex/extract/lib/garc.py
index 88a3d0c..065d738 100644
--- a/pokedex/extract/lib/garc.py
+++ b/pokedex/extract/lib/garc.py
@@ -94,13 +94,16 @@ class GARCEntry(object):
     def __getitem__(self, i):
         start, length = self.slices[i]
         ss = self.stream.slice(start, length)
-        if ss.peek(1) in [b'\x10', b'\x11']:
+        if ss.peek(1) in b'\x10\x11':
             # XXX this sucks but there's no real way to know for sure whether
             # data is compressed or not.  maybe just bake this into the caller
             # and let them deal with it, same way we do with text decoding?
             # TODO it would be nice if this could be done lazily for 'inspect'
             # purposes, since the first four bytes are enough to tell you the
             # size
+            # FIXME make this work even for red herrings, maybe by finishing it
+            # up and doing a trial decompression of the first x bytes
+            #return CompressedStream(ss)
             try:
                 data = lzss3.decompress_bytes(ss.read())
             except Exception:
@@ -113,6 +116,47 @@ class GARCEntry(object):
         return len(self.slices)
 
 
+class CompressedStream:
+    """LZSS stream wrapper that defers decompression until first access."""
+
+    def __init__(self, stream):
+        self.stream = stream
+        header = stream.read(4)
+        stream.seek(0)
+        assert header[0] in b'\x10\x11'
+        self.length, = struct.unpack('<L', header[1:] + b'\x00')
+        self.data = None
+
+    def __len__(self):
+        return self.length
+
+    def _decompress(self):
+        self.data = BytesIO(lzss3.decompress_bytes(self.stream.read()))
+
+    def _ready(self):
+        if self.data is None:
+            self._decompress()
+        return self.data
+
+    def read(self, *args):
+        return self._ready().read(*args)
+
+    def seek(self, *args):
+        return self._ready().seek(*args)
+
+    def tell(self, *args):
+        return self._ready().tell(*args)
+
+    def peek(self, n):
+        buf = self._ready()
+        here = buf.tell()
+        ret = buf.read(n)
+        buf.seek(here)
+        return ret
+
+
+
+
 XY_CHAR_MAP = {
     0x307f: 0x202f,  # nbsp
     0xe08d: 0x2026,  # ellipsis
@@ -360,7 +404,7 @@ def do_inspect(args):
                 else:
                     print()
 
-                cutoff = max(total_subfiles // 10, 2)
+                cutoff = max(total_subfiles // 10, 1)
                 for magic, ct in magic_ctr.most_common():
                     if ct < cutoff:
                         break
diff --git a/pokedex/extract/lib/pc.py b/pokedex/extract/lib/pc.py
index fc4a4ed..078ad9e 100644
--- a/pokedex/extract/lib/pc.py
+++ b/pokedex/extract/lib/pc.py
@@ -11,7 +11,7 @@ class PokemonContainerFile(_ContainerFile):
         self.stream = stream = Substream(stream)
 
         magic, entry_ct = stream.unpack('<2sH')
-        assert magic == b'PC'
+        assert magic in (b'PC', b'PS', b'BL')
 
         # Offsets are "A B C ...", where entry 0 ranges from A to B, entry 1
         # from B to C, etc.
diff --git a/pokedex/extract/oras.py b/pokedex/extract/oras.py
index 2a65581..2b58102 100644
--- a/pokedex/extract/oras.py
+++ b/pokedex/extract/oras.py
@@ -3,24 +3,30 @@
 Filesystem reference: http://www.projectpokemon.org/wiki/ORAS_File_System
 """
 import argparse
-from collections import Counter
 from collections import OrderedDict
 from collections import defaultdict
 from contextlib import contextmanager
-import io
-import itertools
-import math
 from pathlib import Path
 import re
 import shutil
 import struct
+import warnings
 
-from construct import Array, BitField, Bitwise, Magic, OptionalGreedyRange, Padding, Pointer, Struct, SLInt8, SLInt16, ULInt8, ULInt16, ULInt32
-import png
+from construct import (
+    # Simple fields
+    Const, Flag, Int16sl, Int16ul, Int8sl, Int8ul, Int32ul, Padding,
+    # Structures and meta stuff
+    Array, BitsInteger, BitsSwapped, Bitwise, Enum, Filter, FocusedSeq,
+    GreedyRange, Pointer, PrefixedArray, Range, Struct, this,
+    # temp
+    Peek, Bytes,
+)
 import yaml
 
+from pokedex.schema import Pokémon
 from .lib.garc import GARCFile, decrypt_xy_text
-from .lib.text import merge_japanese_texts
+
+# TODO: ribbons!  080 in sumo
 
 # TODO auto-detect rom vs romfs vs...  whatever
 
@@ -30,11 +36,45 @@ from .lib.text import merge_japanese_texts
 
 # TODO would be nice to have meaningful names for the file structure instead of sprinkling hardcoded ones throughout
 
+# SUMO file list:
+# a/2/8/1   "photos" from the credits
 
-CANON_LANGUAGES = ('ja', 'en', 'fr', 'it', 'de', 'es', 'ko')
+GROWTH_RATES = {
+    0: 'gr.medium',
+    1: 'gr.slow-then-very-fast',
+    2: 'gr.fast-then-very-slow',
+    3: 'gr.medium-slow',
+    4: 'gr.fast',
+    5: 'gr.slow',
+}
+TYPES = {
+    0: 't.normal',
+    1: 't.fighting',
+    2: 't.flying',
+    3: 't.poison',
+    4: 't.ground',
+    5: 't.rock',
+    6: 't.bug',
+    7: 't.ghost',
+    8: 't.steel',
+    9: 't.fire',
+    10: 't.water',
+    11: 't.grass',
+    12: 't.electric',
+    13: 't.psychic',
+    14: 't.ice',
+    15: 't.dragon',
+    16: 't.dark',
+    17: 't.fairy',
+}
+
+# ja-Hrkt: hiragana/katakana
+# zh-Hans: simplified
+# zh-Hant: traditional
+CANON_LANGUAGES = ('ja-Hrkt', 'ja', 'en', 'fr', 'it', 'de', 'es', 'ko', 'zh-Hans', 'zh-Hant')
 ORAS_SCRIPT_FILES = {
-    'ja-kana': 'rom/a/0/7/1',
-    'ja-kanji': 'rom/a/0/7/2',
+    'ja-Hrkt': 'rom/a/0/7/1',
+    'ja': 'rom/a/0/7/2',
     'en': 'rom/a/0/7/3',
     'fr': 'rom/a/0/7/4',
     'it': 'rom/a/0/7/5',
@@ -43,16 +83,16 @@ ORAS_SCRIPT_FILES = {
     'ko': 'rom/a/0/7/8',
 }
 SUMO_SCRIPT_FILES = {
-    'ja-kana': 'rom/a/0/3/0',
-    'ja-kanji': 'rom/a/0/3/1',
+    'ja-Hrkt': 'rom/a/0/3/0',
+    'ja': 'rom/a/0/3/1',
     'en': 'rom/a/0/3/2',
     'fr': 'rom/a/0/3/3',
     'it': 'rom/a/0/3/4',
     'de': 'rom/a/0/3/5',
     'es': 'rom/a/0/3/6',
     'ko': 'rom/a/0/3/7',
-    'zh-simplified': 'rom/a/0/3/8',
-    'zh-traditional': 'rom/a/0/3/9',
+    'zh-Hans': 'rom/a/0/3/8',
+    'zh-Hant': 'rom/a/0/3/9',
 }
 ORAS_SCRIPT_ENTRIES = {
     'form-names': 5,
@@ -118,34 +158,69 @@ SUMO_SCRIPT_ENTRIES = {
     'item-names': 36,  # singular
     'item-flavor': 35,
 }
-# The first element in each list is the name of the BASE form -- if it's not
-# None, the base form will be saved under two filenames
-ORAS_EXTRA_SPRITE_NAMES = {
+# The first element in each list is the name of the BASE form.
+# If it's None, then the base form is a true default in some sense, and it'll
+# have the same name as the species.  Mega Evolutions are a good example.
+# Otherwise, there is no default; the form name will differ from the species
+# name, and the first sprite will be saved under both names, e.g., Shellos.
+# Note that this does NOT include megas -- those are pulled from game data.
+FORM_NAMES = {
+    # TODO alolan are of course new in SUMO
+    # Rattata and Raticate
+    19: (None, 'alola'),
+    20: (None, 'alola', 'totem-alola'),
     # Cosplay Pikachu
-    25: (None, 'rock-star', 'belle', 'pop-star', 'phd', 'libre', 'cosplay'),
+    # TODO not in SUMO
+    #25: (None, 'rock-star', 'belle', 'pop-star', 'phd', 'libre', 'cosplay'),
+    25: (None, 'original-cap', 'hoenn-cap', 'sinnoh-cap', 'unova-cap', 'kalos-cap', 'alola-cap'),
+    # Raichu
+    26: (None, 'alola'),
+    # Sandshrew and Sandslash
+    27: (None, 'alola'),
+    28: (None, 'alola'),
+    # Vulpix and Ninetales
+    37: (None, 'alola'),
+    38: (None, 'alola'),
+    # Diglett and Dugtrio
+    50: (None, 'alola'),
+    51: (None, 'alola'),
+    # Meowth and Persian
+    52: (None, 'alola'),
+    53: (None, 'alola'),
+    # Geodude, Graveler, and Golem
+    74: (None, 'alola'),
+    75: (None, 'alola'),
+    76: (None, 'alola'),
+    # Grimer and Muk
+    88: (None, 'alola'),
+    89: (None, 'alola'),
+    # Exeggutor
+    103: (None, 'alola'),
+    # Marowak
+    105: (None, 'alola'),
     # Unown
     201: tuple('abcdefghijklmnopqrstuvwxyz') + ('exclamation', 'question'),
     # Castform
     351: (None, 'sunny', 'rainy', 'snowy'),
     # Kyogre and Groudon
-    382: (None, 'primal',),
-    383: (None, 'primal',),
+    382: (None, 'primal'),
+    383: (None, 'primal'),
     # Deoxys
     386: ('normal', 'attack', 'defense', 'speed'),
     # Burmy and Wormadam
     412: ('plant', 'sandy', 'trash'),
     413: ('plant', 'sandy', 'trash'),
     # Cherrim
-    421: ('overcast', 'sunshine',),
+    421: ('overcast', 'sunshine'),
     # Shellos and Gastrodon
-    422: ('west', 'east',),
-    423: ('west', 'east',),
+    422: ('west', 'east'),
+    423: ('west', 'east'),
     # Rotom
     479: (None, 'heat', 'wash', 'frost', 'fan', 'mow'),
     # Giratina
-    487: ('altered', 'origin',),
+    487: ('altered', 'origin'),
     # Shaymin
-    492: ('land', 'sky',),
+    492: ('land', 'sky'),
     # Arceus
     493: (
         'normal', 'fighting', 'flying', 'poison', 'ground', 'rock', 'bug',
@@ -153,9 +228,9 @@ ORAS_EXTRA_SPRITE_NAMES = {
         'ice', 'dragon', 'dark', 'fairy',
     ),
     # Basculin
-    550: ('red-striped', 'blue-striped',),
+    550: ('red-striped', 'blue-striped'),
     # Darmanitan
-    555: ('standard', 'zen',),
+    555: ('standard', 'zen'),
     # Deerling and Sawsbuck
     585: ('spring', 'summer', 'autumn', 'winter'),
     586: ('spring', 'summer', 'autumn', 'winter'),
@@ -171,6 +246,10 @@ ORAS_EXTRA_SPRITE_NAMES = {
     648: ('aria', 'pirouette'),
     # Genesect
     649: (None, 'douse', 'shock', 'burn', 'chill'),
+    # Greninja
+    # TODO SUMO only
+    # FIXME why is the second one here at all?
+    658: (None, 'dupe', 'ash'),
     # Vivillon
     666: (
         'icy-snow', 'polar', 'tundra', 'continental', 'garden', 'elegant',
@@ -197,179 +276,188 @@ ORAS_EXTRA_SPRITE_NAMES = {
     711: ('average', 'small', 'large', 'super'),
     # Xerneas
     716: ('neutral', 'active'),
+    # Zygarde
+    # TODO SUMO only
+    # TODO why are 10 and 50 duplicated?
+    718: (None, '10', '10', '50', 'complete'),
     # Hoopa
     720: ('confined', 'unbound'),
+    # Gumshoos
+    735: (None, 'totem'),
+    # Vikavolt
+    738: (None, 'totem'),
+    # Oricorio
+    741: ('baile', 'pom-pom', 'pau', 'sensu'),
+    # Lycanroc
+    745: ('midday', 'midnight'),
+    # Wishiwashi
+    746: ('solo', 'school'),
+    # Lurantis
+    754: (None, 'totem'),
+    # Salazzle
+    758: (None, 'totem'),
+    # Silvally
+    773: (
+        'normal', 'fighting', 'flying', 'poison', 'ground', 'rock', 'bug',
+        'ghost', 'steel', 'fire', 'water', 'grass', 'electric', 'psychic',
+        'ice', 'dragon', 'dark', 'fairy',
+    ),
+    # Minior
+    774: (
+        'red-meteor', 'orange-meteor', 'yellow-meteor', 'green-meteor',
+        'blue-meteor', 'indigo-meteor', 'violet-meteor',
+        'red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet',
+    ),
+    # Mimikyu
+    778: ('disguised', 'busted', 'totem-disguised', 'totem-busted'),
+    # Kommo-o
+    784: (None, 'totem'),
+    # Magearna
+    801: (None, 'original'),
 }
 
 
 pokemon_struct = Struct(
-    'pokemon',
-    ULInt8('stat_hp'),
-    ULInt8('stat_atk'),
-    ULInt8('stat_def'),
-    ULInt8('stat_speed'),
-    ULInt8('stat_spatk'),
-    ULInt8('stat_spdef'),
-    ULInt8('type1'),
-    ULInt8('type2'),
-    ULInt8('catch_rate'),
-    ULInt8('stage'),
-    ULInt16('effort'),
-    ULInt16('held_item1'),
-    ULInt16('held_item2'),
-    ULInt16('held_item3'),  # dark grass from bw, unused in oras?
-    ULInt8('gender_rate'),
-    ULInt8('steps_to_hatch'),
-    ULInt8('base_happiness'),
-    ULInt8('exp_curve'),
-    ULInt8('egg_group1'),
-    ULInt8('egg_group2'),
-    ULInt8('ability1'),
-    ULInt8('ability2'),
-    ULInt8('ability_dream'),
-    ULInt8('safari_escape'),
-    ULInt16('form_species_start'),
-    ULInt16('form_sprite_start'),
-    ULInt8('form_count'),
-    ULInt8('color'),
-    ULInt16('base_exp'),
-    ULInt16('height'),
-    ULInt16('weight'),
-    Bitwise(
-        BitField('machines', 14 * 8, swapped=True),
-    ),
+    'stat_hp' / Int8ul,
+    'stat_atk' / Int8ul,
+    'stat_def' / Int8ul,
+    'stat_speed' / Int8ul,
+    'stat_spatk' / Int8ul,
+    'stat_spdef' / Int8ul,
+    'type1' / Int8ul,
+    'type2' / Int8ul,
+    'capture_rate' / Int8ul,
+    'stage' / Int8ul,
+    'effort' / Int16ul,
+    'held_item1' / Int16ul,
+    'held_item2' / Int16ul,
+    'held_item3' / Int16ul,  # dark grass from bw, unused in oras?
+    'gender_rate' / Int8ul,
+    'steps_to_hatch' / Int8ul,
+    'base_happiness' / Int8ul,
+    'growth_rate' / Enum(Int8ul, **{v: k for (k, v) in GROWTH_RATES.items()}),
+    'egg_group1' / Int8ul,
+    'egg_group2' / Int8ul,
+    'ability1' / Int8ul,
+    'ability2' / Int8ul,
+    'ability_hidden' / Int8ul,
+    'safari_escape' / Int8ul,
+    'form_species_start' / Int16ul,
+    'form_sprite_start' / Int16ul,
+    'form_count' / Int8ul,
+    'color' / Int8ul,
+    'base_exp' / Int16ul,
+    'height' / Int16ul,
+    'weight' / Int16ul,
+    'machines' / BitsSwapped(Bitwise(Array(14 * 8, Flag))),
     Padding(2),
-    ULInt32('tutors'),
-    ULInt16('mystery1'),
-    ULInt16('mystery2'),
-    ULInt32('bp_tutors1'),  # unused in sumo
-    ULInt32('bp_tutors2'),  # unused in sumo
-    ULInt32('bp_tutors3'),  # unused in sumo
-    ULInt32('bp_tutors4'),  # sumo: big numbers for pikachu, eevee, snorlax, mew, starter evos, couple others??  maybe special z-move item?
+    'tutors' / Int32ul,
+    'mystery1' / Int16ul,
+    'mystery2' / Int16ul,
+    'bp_tutors1' / Int32ul,  # unused in sumo
+    'bp_tutors2' / Int32ul,  # unused in sumo
+    'bp_tutors3' / Int32ul,  # unused in sumo
+    'bp_tutors4' / Int32ul,  # sumo: big numbers for pikachu, eevee, snorlax, mew, starter evos, couple others??  maybe special z-move item?
     # TODO sumo is four bytes longer, not sure why, find out if those bytes are anything and a better way to express them
-    OptionalGreedyRange(Magic(b'\x00')),
+    GreedyRange(Const(b'\x00')),
 )
 
-pokemon_mega_evolutions_struct = Array(
-    2,  # NOTE: 3 for XY/ORAS, but i don't think the third is ever populated?
+pokemon_mega_evolutions_struct = Filter(this.number != 0, Range(
+    # XY and ORAS have 3 of these, but the third never seems to be populated.
+    # SUMO just has 2.
+    2, 3,
     Struct(
-        'pokemon_mega_evolutions',
-        ULInt16('number'),
-        ULInt16('mode'),
-        ULInt16('mega_stone_itemid'),
+        'number' / Int16ul,
+        'mode' / Int16ul,
+        'mega_stone_itemid' / Int16ul,
         Padding(2),
     )
+))
+
+egg_moves_struct = Struct(
+    'moveids' / PrefixedArray(Int16ul, Int16ul),
 )
 
 egg_moves_struct = Struct(
-    'egg_moves',
-    ULInt16('count'),
-    Array(
-        lambda ctx: ctx.count,
-        ULInt16('moveids'),
-    ),
+    'first_form_id' / Int16ul,  # TODO SUMO ONLY
+    'moveids' / PrefixedArray(Int16ul, Int16ul),
 )
 
-egg_moves_struct = Struct(
-    'egg_moves',
-    ULInt16('first_form_id'),  # TODO SUMO ONLY
-    ULInt16('count'),
-    Array(
-        lambda ctx: ctx.count,
-        ULInt16('moveids'),
-    ),
-)
-
-level_up_moves_struct = OptionalGreedyRange(
+level_up_moves_struct = GreedyRange(
     Struct(
-        'level_up_pair',
-        SLInt16('moveid'),
-        SLInt16('level'),
+        'moveid' / Int16sl,
+        'level' / Int16sl,
     ),
 )
 
 move_struct = Struct(
-    'move',
-    ULInt8('type'),
-    ULInt8('category'),
-    ULInt8('damage_class'),
-    ULInt8('power'),
-    ULInt8('accuracy'),
-    ULInt8('pp'),
-    SLInt8('priority'),
-    ULInt8('min_max_hits'),
-    SLInt16('caused_effect'),
-    ULInt8('effect_chance'),
-    ULInt8('status'),
-    ULInt8('min_turns'),
-    ULInt8('max_turns'),
-    ULInt8('crit_rate'),
-    ULInt8('flinch_chance'),
-    ULInt16('effect'),
-    SLInt8('recoil'),
-    ULInt8('healing'),
-    ULInt8('range'),            # ok
-    Bitwise(
-        BitField('stat_change', 24),
-    ),
-    Bitwise(
-        BitField('stat_amount', 24),
-    ),
-    Bitwise(
-        BitField('stat_chance', 24),
-    ),
-    ULInt8('padding0'),         # ok
-    ULInt8('padding1'),         # ok
-    ULInt16('flags'),
-    ULInt8('padding2'),         # ok
-    ULInt8('extra'),
+    'type' / Enum(Int8ul, **{v:k for (k, v) in TYPES.items()}),
+    'category' / Int8ul,
+    'damage_class' / Int8ul,
+    'power' / Int8ul,
+    'accuracy' / Int8ul,
+    'pp' / Int8ul,
+    'priority' / Int8sl,
+    'min_max_hits' / Int8ul,
+    'caused_effect' / Int16sl,
+    'effect_chance' / Int8ul,
+    'status' / Int8ul,
+    'min_turns' / Int8ul,
+    'max_turns' / Int8ul,
+    'crit_rate' / Int8ul,
+    'flinch_chance' / Int8ul,
+    'effect' / Int16ul,
+    'recoil' / Int8sl,
+    'healing' / Int8ul,
+    'range' / Int8ul,            # ok
+    'stat_change' / Bitwise(Array(6, BitsInteger(4))),
+    'stat_amount' / Bitwise(Array(6, BitsInteger(4))),
+    'stat_chance' / Bitwise(Array(6, BitsInteger(4))),
+    'padding0' / Int8ul,         # ok
+    'padding1' / Int8ul,         # ok
+    'flags' / Int16ul,
+    'padding2' / Int8ul,         # ok
+    'extra' / Int8ul,
+    # FIXME unsure whether this exists in ORAS; should use a length limiter in the parent
+    'extra2' / Int32ul,
 )
-move_container_struct = Struct(
-    'move_container',
-    Magic(b'WD'),  # waza...  descriptions?
-    ULInt16('record_ct'),
-    Array(
-        lambda ctx: ctx.record_ct,
-        Struct(
-            'records',
-            ULInt32('offset'),
-            Pointer(lambda ctx: ctx.offset, move_struct),
-        ),
-    ),
+move_container_struct = FocusedSeq('records',
+    Const(b'WD'),  # waza...  descriptions?
+    'records' / PrefixedArray(Int16ul, FocusedSeq('move',
+        'offset' / Int32ul,
+        'move' / Pointer(this.offset, move_struct),
+    )),
 )
 
 pokemon_sprite_struct = Struct(
-    'pokemon_sprite_config',
-    ULInt16('index'),
-    ULInt16('female_index'),
-    ULInt32('form_index_offset'),
-    ULInt32('right_index_offset'),
-    ULInt16('form_count'),
-    ULInt16('right_count'),
+    'index' / Int16ul,
+    'female_index' / Int16ul,
+    'form_index_offset' / Int32ul,
+    'right_index_offset' / Int32ul,
+    'form_count' / Int16ul,
+    'right_count' / Int16ul,
 )
 
 encounter_struct = Struct(
-    'encounter',
     # TODO top 5 bits are form stuff
-    ULInt16('pokemon_id'),
-    ULInt8('min_level'),
-    ULInt8('max_level'),
+    'pokemon_id' / Int16ul,
+    'min_level' / Int8ul,
+    'max_level' / Int8ul,
 )
 
 encounter_table_struct = Struct(
-    'encounter_table',
-    ULInt8('walk_rate'),
-    ULInt8('long_grass_rate'),
-    ULInt8('hidden_rate'),
-    ULInt8('surf_rate'),
-    ULInt8('rock_smash_rate'),
-    ULInt8('old_rod_rate'),
-    ULInt8('good_rod_rate'),
-    ULInt8('super_rod_rate'),
-    ULInt8('horde_rate'),
-    Magic(b'\x00' * 5),
+    'walk_rate' / Int8ul,
+    'long_grass_rate' / Int8ul,
+    'hidden_rate' / Int8ul,
+    'surf_rate' / Int8ul,
+    'rock_smash_rate' / Int8ul,
+    'old_rod_rate' / Int8ul,
+    'good_rod_rate' / Int8ul,
+    'super_rod_rate' / Int8ul,
+    'horde_rate' / Int8ul,
+    Const(b'\x00' * 5),
     Array(61, encounter_struct),
-    Magic(b'\x00' * 2),
+    Const(b'\x00' * 2),
 )
 
 ORAS_ENCOUNTER_SLOTS = [
@@ -666,49 +754,15 @@ def extract_data(root, out):
                 entry = garc[entryid][0]
                 texts[lang][entryname] = decrypt_xy_text(entry.read())
 
-    # Japanese text is special!  It's written in both kanji and kana, and we
-    # want to combine them
-    texts['ja'] = {}
-    #for entryname in ORAS_SCRIPT_ENTRIES:
-    for entryname in SUMO_SCRIPT_ENTRIES:
-        kanjis = texts['ja-kanji'][entryname]
-        kanas = texts['ja-kana'][entryname]
-        # But not if they're names of things.
-        # (TODO this might not be true in the case of, say, towns?  in which
-        # case, what do we do?  we want to ultimately put these in urls and
-        # whatnot, right, but we don't want furigana there  :S  do we need a
-        # separate "identifier" field /per language/?)
-        assert len(kanas) == len(kanjis)
-        if kanjis == kanas:
-            texts['ja'][entryname] = kanjis
-        else:
-            texts['ja'][entryname] = [
-                merge_japanese_texts(kanji, kana)
-                for (kanji, kana) in zip(kanjis, kanas)
-            ]
-    del texts['ja-kanji']
-    del texts['ja-kana']
-
     identifiers = {}
-    identifiers['species'] = []
-    for i, (species_name, form_name) in enumerate(itertools.zip_longest(
-            texts['en']['species-names'],
-            texts['en']['form-names'],
-            )):
-        if species_name:
-            ident = make_identifier(species_name)
-        else:
-            # TODO proooooobably fix this
-            ident = 'uhhhhh'
-            #print("??????", i, species_name, form_name)
-        if form_name:
-            ident = ident + '-' + make_identifier(form_name)
-        # TODO hold up, how are these /species/ identifiers?
-        identifiers['species'].append(ident)
-    identifiers['move'] = [
-        make_identifier(name)
-        for name in texts['en']['move-names']
-    ]
+    # FIXME should use a known list, mayyybe compare against this
+    identifiers['species'] = list(map(make_identifier, texts['en']['species-names']))
+    # Provisional: starts as a copy of the species list; the Pokémon loop below
+    # renames entries and fills in forms.  FIXME the [None] * 1000 pad is arbitrary
+    identifiers['pokémon'] = identifiers['species'][:] + [None] * 1000
+    identifiers['move'] = list(map(make_identifier, texts['en']['move-names']))
+    identifiers['item'] = list(map(make_identifier, texts['en']['item-names']))
+    identifiers['ability'] = list(map(make_identifier, texts['en']['ability-names']))
 
     textdir = out / 'script'
     if not textdir.exists():
@@ -722,6 +776,8 @@ def extract_data(root, out):
 
     """
     # Encounters
+    22:42 < magical> note to self: X/Y ambush encounters are found in DllField.cro, starting at 0xf40d0
+    23:02 < magical> friend safari pokemon at 0x13d34a
     # TODO move mee elsewheeere -- actually all of these should be in their own pieces
     places = OrderedDict()
     name_index_to_place = {}
@@ -853,7 +909,7 @@ def extract_data(root, out):
                                 levels = str(enc.min_level)
                             else:
                                 levels = "{} - {}".format(enc.min_level, enc.max_level)
-                            pokemon_ident = identifiers['species'][enc.pokemon_id & 0x1ff]
+                            pokemon_ident = identifiers['pokémon'][enc.pokemon_id & 0x1ff]
                             pokemon_form_bits = enc.pokemon_id >> 9
                             # TODO maybe turn this into, i have no idea, a
                             # custom type?  something forcibly short??
@@ -892,7 +948,9 @@ def extract_data(root, out):
         machines = []
         #f.seek(0x004a67ee)  # ORAS
         f.seek(0x0049795a)  # SUMO
+        # TODO magic number (107)
         machineids = struct.unpack('<107H', f.read(2 * 107))
+        # TODO dunno if this is still true
         # Order appears to be based on some gen 4 legacy: TMs 1 through 92, HMs
         # 1 through 6, then the other eight TMs and the last HM.  But the bits
         # in the Pokémon structs are in the expected order of 1 through 100, 1
@@ -909,43 +967,141 @@ def extract_data(root, out):
 
     # -------------------------------------------------------------------------
     # Pokémon structs
-    # TODO SUMO 0/1/8 seems to contain the index for the "base" species
+    mega_evolutions = get_mega_evolutions(root)
+    all_pokémon = OrderedDict()
     pokemon_data = []
     with read_garc(root / 'rom/a/0/1/7') as garc:  # SUMO
     #with read_garc(root / 'rom/a/1/9/5') as garc:  # ORAS
         personals = [subfile[0].read() for subfile in garc]
     _pokemon_forms = {}  # "real" species id => (base species id, form name id)
-    _next_name_form_id = 723  # TODO magic number
+    _next_name_form_id = 803  # previously 723; TODO magic numbers
+    print("number of flavor texts", len(texts['en']['species-flavor-moon']))
     for i, personal in enumerate(personals[:-1]):
         record = pokemon_struct.parse(personal)
-        # TODO transform to an OD somehow probably
-        pokemon_data.append(record)
-        print(i, hex(record.bp_tutors4))
-        #print("{:3d} {:15s} {} {:5d} {:5d}".format(
-        #    i,
-        #    identifiers['species'][baseid],
-        #    ('0'*16 + bin(record.mystery1)[2:])[-16:],
-        #    record.mystery2,
-        #    record.stage,
-        #))
-        # TODO some pokemon have sprite starts but no species start, because their sprites vary obv
-        if record.form_count > 1:
-            # The form names appear to be all just jammed at the end in order,
-            # completely unrelated to either of the "start" offsets here
+
+        # FIRST THINGS FIRST: let's deal with forms.
+        # TODO some pokemon, like unown, /only/ have sprite variations, so they
+        # don't have a form_species_start here.  what do i do about them?
+        if (record.form_count > 1) != bool(record.form_species_start):
+            print("!!! sprite-only forms, argh")
+        # The > i check makes sure we don't run this bit a second time when we
+        # hit the forms themselves
+        if record.form_count > 1 and record.form_species_start > i:
+            megas = mega_evolutions[i]
+            if len(megas) == 1:
+                assert i not in FORM_NAMES
+                form_names = ['mega']
+            elif len(megas) == 2:
+                assert i not in FORM_NAMES
+                form_names = ['mega-x', 'mega-y']
+            else:
+                assert not megas
+                form_names = FORM_NAMES[i][1:]
+                # Fix our own name if necessary
+                if FORM_NAMES[i][0]:
+                    identifiers['pokémon'][i] += '-' + FORM_NAMES[i][0]
+
+            if record.form_count - 1 != len(form_names):
+                print("!!!!! MISMATCH", record.form_count - 1, len(form_names))
             for offset in range(record.form_count - 1):
+                # Form names appear to be all just jammed at the end in order,
+                # completely unrelated to either of the "start" offsets here
                 #form_name = texts['en']['form-names'][_next_name_form_id]
 
-                if record.form_species_start:
-                    # TODO still no idea how "intangible" forms are being
-                    # handled in the new schema
-                    _pokemon_forms[record.form_species_start + offset] = i, _next_name_form_id
-
+                # TODO still no idea how "intangible" forms are being
+                # handled in the new schema
+                _pokemon_forms[record.form_species_start + offset] = i, _next_name_form_id
                 _next_name_form_id += 1
 
-        if record.form_species_start:
-            for offset in range(record.form_count - 1):
-                # TODO grab the form names argh
-                identifiers['species'][record.form_species_start + offset] = identifiers['species'][i]
+                identifiers['pokémon'][record.form_species_start + offset] = identifiers['species'][i] + '-' + form_names[offset]
+
+        pokémon = Pokémon()
+        all_pokémon[identifiers['pokémon'][i]] = pokémon
+        pokémon.game_index = i
+
+        if i in _pokemon_forms:
+            base_species_id, form_name_id = _pokemon_forms[i]
+        else:
+            base_species_id = i
+            form_name_id = i
+        # TODO i observe this is explicitly a species name, the one thing that
+        # really is shared between forms
+        pokémon.name = OrderedDict(
+            (language, texts[language]['species-names'][base_species_id])
+            for language in CANON_LANGUAGES)
+        pokémon.genus = OrderedDict(
+            (language, texts[language]['genus-names'][base_species_id])
+            for language in CANON_LANGUAGES)
+        # FIXME ho ho, hang on a second, forms have their own flavor text too!!
+        pokémon.flavor_text = OrderedDict(
+            # TODO well this depends on which game you're dumping
+            (language, texts[language]['species-flavor-moon'][base_species_id])
+            for language in CANON_LANGUAGES)
+        # FIXME include form names?  only when they exist?  can that be
+        # inconsistent between languages?
+
+        pokémon.base_stats = {
+            'hp': record.stat_hp,
+            'attack': record.stat_atk,
+            'defense': record.stat_def,
+            'special-attack': record.stat_spatk,
+            'special-defense': record.stat_spdef,
+            'speed': record.stat_speed,
+        }
+        # FIXME pokémon.types = [record.type1]
+        pokémon.capture_rate = record.capture_rate
+        # TODO stage?
+        # FIXME effort
+        # Held items are a bit goofy; if the same item is in all three slots, it always appears!
+        pokémon.held_items = {}
+        if 0 != record.held_item1 == record.held_item2 == record.held_item3:
+            pokémon.held_items[identifiers['item'][record.held_item1]] = 100
+        else:
+            if record.held_item1:
+                pokémon.held_items[identifiers['item'][record.held_item1]] = 50
+            if record.held_item2:
+                pokémon.held_items[identifiers['item'][record.held_item2]] = 5
+            if record.held_item3:
+                pokémon.held_items[identifiers['item'][record.held_item3]] = 1
+
+        # TODO i think this needs some normalizing?  maybe renaming because
+        # this doesn't at all imply what it means
+        pokémon.gender_rate = record.gender_rate
+
+        pokémon.hatch_counter = record.steps_to_hatch
+        pokémon.base_happiness = record.base_happiness
+        pokémon.growth_rate = record.growth_rate
+        # FIXME egg groups
+        pokémon.abilities = [
+            identifiers['ability'][ability]
+            for ability in (record.ability1, record.ability2, record.ability_hidden)
+        ]
+        # FIXME safari escape??
+        # FIXME form stuff
+        # FIXME color
+        pokémon.base_experience = record.base_exp
+        # FIXME what units are these!
+        pokémon.height = record.height
+        pokémon.weight = record.weight
+
+        pokémon.moves = {}
+
+
+
+
+        # TODO transform to an OD somehow probably
+        pokemon_data.append(record)
+        print("{:4d} {:25s} {} {:5d} {:5d} {:20s} {:4d} {:4d} {:2d}".format(
+            i,
+            identifiers['pokémon'][i],
+            ('0'*16 + bin(record.mystery1)[2:])[-16:],
+            record.mystery2,
+            record.stage,
+            texts['en']['form-names'][i],
+            record.form_species_start,
+            record.form_sprite_start,
+            record.form_count,
+        ))
 
     #for i in range(723, 825 + 1):
     #    base_species_id, form_name_id = _pokemon_forms[i]
@@ -955,31 +1111,42 @@ def extract_data(root, out):
 
     # -------------------------------------------------------------------------
     # Move stats
-    movesets = OrderedDict()
-    with read_garc(root / 'rom/a/0/1/1') as garc:  # SUMO
     #with read_garc(root / 'rom/a/1/8/9') as garc:  # ORAS
+    with read_garc(root / 'rom/a/0/1/1') as garc:  # SUMO
         # Only one subfile
+        # TODO assert this wherever i do it
         data = garc[0][0].read()
-        container = move_container_struct.parse(data)
-        for n, record in enumerate(container.records):
-            m = record.move
+        print(Struct('magic' / Bytes(2), 'count' / Int16ul, 'pointers' / Array(16, Int32ul)).parse(data))
+        print(move_struct.sizeof())
+        records = move_container_struct.parse(data)
+        for i, record in enumerate(records):
+            #print(texts['en']['move-names'][i])
+            #print(record)
             # TODO with the release of oras all moves have contest types and effects again!  where are they??
-            #print("{:3d} {:20s} | {m.type:3d} {m.power:3d} {m.pp:2d} {m.accuracy:3d} / {m.priority:2d} {m.range:2d} {m.damage_class:1d} / {m.effect:3d} {m.caused_effect:3d} {m.effect_chance:3d}  --  {m.status:3d} {m.min_turns:3d} {m.max_turns:3d} {m.crit_rate:3d} {m.flinch_chance:3d} {m.recoil:4d} {m.healing:3d} / {m.stat_change:06x} {m.stat_amount:06x} {m.stat_chance:06x} / {m.padding0:3d} {m.padding1:3d} {m.flags:04x} {m.padding2:3d} {m.extra:3d}".format(
-            #    n,
-            #    identifiers['move'][n],
-            #    m=record.move,
-            #))
+            print("{:3d} {:30s} | {m.type:10s} {m.category:3d} / {m.power:3d} {m.pp:2d} {m.accuracy:3d} / {m.priority:2d} {m.range:2d} {m.damage_class:1d} / {m.effect:3d} {m.caused_effect:3d} {m.effect_chance:3d}  --  {m.status:3d} {m.min_turns:3d} {m.max_turns:3d} {m.crit_rate:3d} {m.flinch_chance:3d} {m.recoil:4d} {m.healing:3d} / {m.stat_change!r} {m.stat_amount!r} {m.stat_chance!r} ~ {m.padding0:3d} {m.padding1:3d} {m.flags:04x} {m.padding2:3d} {m.extra:3d} {m.extra2:10d}".format(
+                i,
+                texts['en']['move-names'][i],
+                m=record,
+            ))
+    return
 
     # Egg moves
     with read_garc(root / 'rom/a/0/1/2') as garc:  # SUMO
     #with read_garc(root / 'rom/a/1/9/0') as garc:  # ORAS
+        print("number of egg moves:", len(garc))
         for i, subfile in enumerate(garc):
-            ident = identifiers['species'][i]
+            ident = identifiers['pokémon'][i]
             data = subfile[0].read()
             if not data:
                 continue
             container = egg_moves_struct.parse(data)
-            moveset = movesets.setdefault(ident, OrderedDict())
+            print(i, ident, container.first_form_id, container.moveids)
+            # FIXME: 961 pokémon, 1063 named forms, but 1048 egg movesets.
+            # what?  they get completely out of order after 802 and i don't
+            # know how to fix this.  didn't magical write some code...?
+            if i > len(identifiers['species']):
+                continue
+            moveset = all_pokémon[ident].moves
             eggset = moveset['egg'] = []
             for moveid in container.moveids:
                 eggset.append(identifiers['move'][moveid])
@@ -987,10 +1154,11 @@ def extract_data(root, out):
     # Level-up moves
     with read_garc(root / 'rom/a/0/1/3') as garc:  # SUMO
     #with read_garc(root / 'rom/a/1/9/1') as garc:  # ORAS
+        print("number of level-up moves", len(garc))
         for i, subfile in enumerate(garc):
-            ident = identifiers['species'][i]
+            ident = identifiers['pokémon'][i]
             level_up_moves = subfile[0].read()
-            moveset = movesets.setdefault(ident, OrderedDict())
+            moveset = all_pokémon[ident].moves
             levelset = moveset['level'] = []
             lastlevel = None
             order = 1
@@ -998,10 +1166,11 @@ def extract_data(root, out):
                 # End is indicated with -1, -1
                 if pair.moveid <= 0:
                     break
-                levelset.append((
-                    pair.level,
-                    identifiers['move'][pair.moveid],
-                ))
+                # FIXME this is a goofy-looking structure, but it makes the
+                # yaml come out nicely?
+                levelset.append({
+                    pair.level: identifiers['move'][pair.moveid],
+                })
 
                 if pair.level == lastlevel:
                     order += 1
@@ -1038,8 +1207,8 @@ def extract_data(root, out):
 
     # Tutor moves (from the personal structs)
     for i, datum in enumerate(pokemon_data):
-        ident = identifiers['species'][i]
-        moveset = movesets.setdefault(ident, OrderedDict())
+        ident = identifiers['pokémon'][i]
+        moveset = all_pokémon[ident].moves
         tutorset = moveset['tutor'] = []
         for key, tutors in tutor_moves.items():
             for bit, moveident in enumerate(tutors):
@@ -1052,27 +1221,27 @@ def extract_data(root, out):
         # TMs
         machineset = moveset['machine'] = []
         for bit, moveident in enumerate(machines):
-            if not datum['machines'] & (1 << bit):
+            if not datum['machines'][bit]:
                 continue
             machineset.append(moveident)
 
-    with (out / 'movesets.yaml').open('w') as f:
-        dump_to_yaml(movesets, f)
+    with (out / 'pokemon.yaml').open('w') as f:
+        #dump_to_yaml(all_pokémon, f)
+        import pokedex.schema as schema
+        from camel import Camel
+        f.write(Camel([schema.POKEDEX_TYPES]).dump(all_pokémon))
 
 
-def get_mega_counts(root):
-    """Return a dict mapping Pokémon ids to how many mega evolutions each one
-    has.
+def get_mega_evolutions(root):
+    """Return a dict mapping Pokémon ids to a list of mega evolution records.
     """
-    mega_counts = {}  # pokemonid => number of mega evos
+    megas = {}
     #with read_garc(root / 'rom/a/1/9/3') as garc:  # oras
     with read_garc(root / 'rom/a/0/1/5') as garc:  # SUMO
         for pokemonid, subfile in enumerate(garc):
-            mega_evos = pokemon_mega_evolutions_struct.parse_stream(subfile[0])
-            mega_counts[pokemonid] = max(
-                mega_evo.number for mega_evo in mega_evos)
+            megas[pokemonid] = pokemon_mega_evolutions_struct.parse_stream(subfile[0])
 
-    return mega_counts
+    return megas
 
 
 class SpriteFileNamer:
@@ -1117,7 +1286,7 @@ class SpriteFileNamer:
                     .format(self.mega_counts[pokemonid], pokemonid))
         else:
             # TODO should use warnings for this so it works for new games
-            #raise ValueError("Pokemon {} doesn't have forms".format(pokemonid))
+            warnings.warn("Don't know any forms for Pokemon {}".format(pokemonid))
             form = "form-{}".format(formid)
 
         # Construct the directory
@@ -1192,50 +1361,30 @@ class SpriteFileNamer:
             shutil.copyfile(str(fn), str(fn2))
 
 
-def write_clim_to_png(f, width, height, color_depth, palette, pixels):
-    """Write the results of ``decode_clim`` to a file object."""
-    writer_kwargs = dict(width=width, height=height)
-    if palette:
-        writer_kwargs['palette'] = palette
-    else:
-        # TODO do i really only need alpha=True if there's no palette?
-        writer_kwargs['alpha'] = True
-    writer = png.Writer(**writer_kwargs)
-
-    # For a paletted image, I want to preserve Zhorken's good idea of
-    # indicating the original bit depth with an sBIT chunk.  But PyPNG can't do
-    # that directly, so instead I have to do some nonsense.
-    if palette:
-        buf = io.BytesIO()
-        writer.write(buf, pixels)
-
-        # Read the PNG as chunks, and manually add an sBIT chunk
-        buf.seek(0)
-        png_reader = png.Reader(buf)
-        chunks = list(png_reader.chunks())
-        sbit = bytes([color_depth] * 3)
-        chunks.insert(1, ('sBIT', sbit))
-
-        # Now write the chunks to the file
-        png.write_chunks(f, chunks)
-
-    else:
-        # Otherwise, it's...  almost straightforward.
-        writer.write(f, (itertools.chain(*row) for row in pixels))
-
-
 def extract_box_sprites(root, out):
-    namer = SpriteFileNamer(
-        out, get_mega_counts(root), ORAS_EXTRA_SPRITE_NAMES)
+    mega_counts = {
+        id: len(megas)
+        for (id, megas) in get_mega_evolutions(root).items()
+    }
+    namer = SpriteFileNamer(out, mega_counts, FORM_NAMES)
 
     with (root / 'exe/code.bin').open('rb') as f:
         # Form configuration, used to put sprites in the right order
         # NOTE: in x/y the address is 0x0043ea98
         #f.seek(0x0047d650)  # ORAS
         f.seek(0x004999d0)  # SUMO
-        # TODO magic number
-        for n in range(722):
+        # Discard dummy zero sprite
+        pokemon_sprite_struct.parse_stream(f)
+        n = 0
+        while True:
             sprite = pokemon_sprite_struct.parse_stream(f)
+            # This is not particularly reliable, but the data immediately
+            # following this list is some small 32-bit values, so the female
+            # index will be (illegally) zero
+            if not sprite.female_index:
+                break
+
+            n += 1
             namer.add(sprite.index, n)
             if sprite.female_index != sprite.index:
                 namer.add(sprite.female_index, n, female=True)
@@ -1289,26 +1438,28 @@ def extract_box_sprites(root, out):
     with read_garc(root / 'rom/a/0/6/2') as garc:  # SUMO
         from .lib.clim import decode_clim
         for i, subfile in enumerate(garc):
-            if i == 0:
-                # Dummy blank sprite, not interesting to us
-                continue
-            elif i == 333:
-                # Duplicate Entei sprite that's not used
-                continue
-            elif i == len(garc) - 1:
+            # TODO ORAS ONLY
+            #elif i == 333:
+            #    # Duplicate Entei sprite that's not used
+            #    continue
+            if i == len(garc) - 1:
                 # Very last one is egg
                 namer.inject(i, 'egg.png')
 
+            # TODO this is bad.
+            if not namer.index_to_filenames[i]:
+                # Unused sprite -- e.g. index 0, or one of the dummies in SUMO
+                continue
+
             data = subfile[0].read()
-            width, height, color_depth, palette, pixels = decode_clim(data)
+            image_data = decode_clim(data)
 
             # TODO this is bad.
             if 'right/' in namer.index_to_filenames[i][0]:
-                for row in pixels:
-                    row.reverse()
+                image_data.mirror()
 
             with namer.open(i) as f:
-                write_clim_to_png(f, width, height, color_depth, palette, pixels)
+                image_data.write_to_png(f)
 
 
 def extract_dex_sprites(root, out):
@@ -1317,58 +1468,103 @@ def extract_dex_sprites(root, out):
     # Luckily the dex sprites are in the same order as the models
     # (unsurprising, as they're just model renders), which also tells us what
     # Pokémon have female forms.  The mega evolution map tells us which forms
-    # are megas, and the rest are listed manually above as
-    # ORAS_EXTRA_SPRITE_NAMES.
+    # are megas, and the rest are listed manually above as FORM_NAMES.
 
-    namer = SpriteFileNamer(
-        out, get_mega_counts(root), ORAS_EXTRA_SPRITE_NAMES)
+    mega_counts = {
+        id: len(megas)
+        for (id, megas) in get_mega_evolutions(root).items()
+    }
+    namer = SpriteFileNamer(out, mega_counts, FORM_NAMES)
 
     # TODO Meowstic is counted as simply female in here, but should probably be
     # saved with a form filename as well
+    # TODO should skip the extra Komala and the totem forms
     #with read_garc(root / 'rom/a/0/0/8') as garc:  # ORAS
     with read_garc(root / 'rom/a/0/9/4') as garc:  # SUMO
         f = garc[0][0]
-        # TODO magic number
-        for n in range(721):
-            # Unlike /virtually everywhere else/, Pokémon are zero-indexed here
-            pokemonid = n + 1
+        pokemonid = 0
+        while True:
+            pokemonid += 1
+            data = f.read(4)
+            # All zeroes means we're done.  Maybe.  More data follows after
+            # this, but it doesn't seem to be the same format, and I don't know
+            # what exactly it's for.
+            if data == b'\x00\x00\x00\x00':
+                break
+
             # Index of the first model (also zero-indexed), how many models the
             # Pokémon has, and some flags
-            start, count, flags = struct.unpack('<HBB', f.read(4))
-            model_num = start + 1
-            # For some asinine reason, Xerneas is counted as two separate
-            # Pokémon in the dex sprites but not the models, so we have to
-            # shift everything after it back by 1
+            start, count, flags = struct.unpack('<HBB', data)
+            # TODO this was CHANGED for SUMO -- for ORAS all the shiny sprites are a second block at the end!
+            #model_num = start + 1
+            model_num = start * 2 + 1
+            #print("pokemon {:3d} -- start {:4d} ({:4d}) -- count {:2d} -- flags {:08b}".format(pokemonid, start, model_num, count, flags))
+            # Fix a few odd disconnects between the model listing and the
+            # actual dex sprites.
+            # TODO there must be a dex sprite index somewhere, this is silly
+            # Xerneas has two models, but three dex sprites
             if pokemonid == 716:
                 count = 2
-            elif pokemonid >= 717:
-                model_num += 1
+            # Lurantis has two models, but one dex sprite
+            if pokemonid == 754:
+                count = 1
+                flags &= ~4
+            # Salazzle has two models, but one dex sprite
+            if pokemonid == 758:
+                count = 1
+                flags &= ~4
+            # Komala has one model, but two dex sprites
+            # FIXME probably skip extracting it at all
+            if pokemonid == 775:
+                count = 2
+            # The above all naturally throw later numbering off; compensate
+            if 716 < pokemonid <= 754:
+                model_num += 2
+            elif 758 < pokemonid <= 775:
+                model_num -= 2
 
             namer.add(model_num, pokemonid)
+            # TODO SUMO ONLY (should be += 1 for ORAS)
+            namer.add(model_num + 1, pokemonid, shiny=True)
+            model_num += 2
+
             form_count = count - 1  # discount "base" form
+            # TODO this is only used for ORAS, and should be done another way anyway
             total_model_count = model_num + count - 1
 
             # Don't know what flag 1 is; everything has it.
-            # Flag 2 means the first alternate form is a female variant.
+            # Flag 2 means the first alternate form is female.
             if flags & 2:
                 assert form_count > 0
                 form_count -= 1
-                model_num += 1
                 namer.add(model_num, pokemonid, female=True)
+                namer.add(model_num + 1, pokemonid, female=True, shiny=True)
+                model_num += 2
             # Flag 4 just means there are more forms?
             if flags & 4:
                 assert form_count
 
             for formid in range(1, form_count + 1):
-                model_num += 1
                 namer.add(model_num, pokemonid, formid)
+                namer.add(model_num + 1, pokemonid, formid, shiny=True)
+                model_num += 2
 
     # And now, do the ripping
     #with read_garc(root / 'rom/a/2/6/3') as garc:  # ORAS
-    with read_garc(root / 'rom/a/2/4/0') as garc:  # sun/moon demo
+    with read_garc(root / 'rom/a/2/4/0') as garc:  # SUMO
         from .lib.clim import decode_clim
         from .lib.etc1 import decode_etc1
         for i, subfile in enumerate(garc):
+            if i == 0:
+                # Dummy sprite, not interesting to us
+                continue
+
+            data = subfile[0].read()
+            """
+            with open("{}/{}.png".format(str(out), i), 'wb') as f:
+                write_clim_to_png(f, *decode_etc1(data))
+            continue
+            # TODO THIS IS ALL ORAS ONLY
             shiny_prefix = None
             if i > total_model_count:
                 i -= total_model_count
@@ -1376,18 +1572,18 @@ def extract_dex_sprites(root, out):
                 # hack in the other code
                 shiny_prefix = 'shiny/'
 
-            if i == 0:
-                # Dummy blank sprite, not interesting to us
-                continue
             elif 37 <= i <= 41:
                 # Cosplay Pikachu's outfits -- the sprites are blank, so saving
                 # these is not particularly useful
                 continue
+            """
 
             data = subfile[0].read()
-            with namer.open(i, prefix=shiny_prefix) as f:
-                write_clim_to_png(f, *decode_etc1(data))
-                #write_clim_to_png(f, *decode_clim(data))
+            with namer.open(i) as f:
+                decode_etc1(data).write_to_png(f)
+            # TODO ORAS
+            #with namer.open(i, prefix=shiny_prefix) as f:
+            #    decode_clim(data).write_to_png(f)
 
 
 def _munge_source_arg(strpath):
diff --git a/pokedex/schema.py b/pokedex/schema.py
index e22b47a..ab69216 100644
--- a/pokedex/schema.py
+++ b/pokedex/schema.py
@@ -164,6 +164,8 @@ Evolution = _ForwardDeclaration()
 EncounterMap = _ForwardDeclaration()
 MoveSet = _ForwardDeclaration()
 Pokedex = _ForwardDeclaration()
+Item = _ForwardDeclaration()
+Ability = _ForwardDeclaration()
 
 
 class Pokémon(VersionedLocus):
@@ -173,6 +175,9 @@ class Pokémon(VersionedLocus):
     base_stats = _Map(Stat, int)
     growth_rate = _Value(GrowthRate)
     base_experience = _Value(int, min=0, max=255)
+    capture_rate = _Value(int, min=0, max=255)
+    held_items = _Map(Item, int)
+    gender_rate = _Value(int)
 
     pokedex_numbers = _Map(Pokedex, int)
 
@@ -202,9 +207,25 @@ class Pokémon(VersionedLocus):
     # TODO should this be written in hex, maybe?
     game_index = _Value(int)
 
+    # FIXME how do I distinguish the hidden ability?
+    abilities = _List(Ability)
+
 Pokemon = Pokémon
 
 
+MoveEffect = _ForwardDeclaration()
+
+class Move(VersionedLocus):  # schema node describing a single move
+    name = _Localized(str)  # str wrapped in _Localized -- presumably per-language; confirm
+    type = _Value(Type)  # single Type value; Type is not defined in this hunk
+    power = _Value(int)  # int with no min/max bounds declared
+    pp = _Value(int)  # int with no min/max bounds declared
+    accuracy = _Value(int)  # int with no min/max bounds declared; scale not shown here
+    effect = _Value(MoveEffect)  # MoveEffect is only a forward declaration at this point
+
+
+
+
 # ------------------------------------------------------------------------------
 # The repository class, primary interface to the data