Write a gbz80 pattern matcher, for more robust address sniffing

With this, Pokémon names are finally detected correctly from both R/G and R/B. Hurrah. Yellow doesn't work yet, though. Sigh.
2024-08-20 18:16:34 +00:00 · 2016-08-21 16:44:07 -07:00 · 2016-08-21 16:44:07 -07:00 · 122da8d885
commit 122da8d885
parent 969d671c48
2 changed files with 841 additions and 88 deletions
--- a/pokedex/extract/lib/gbz80.py
+++ b/pokedex/extract/lib/gbz80.py
@ -0,0 +1,647 @@
+"""Stuff for dealing with the Game Boy's Z80-ish machine code.  Most notably,
+can do pattern-matching against a chunk of assembly with missing values.
+"""
+from collections import OrderedDict
+from collections import defaultdict
+import re
+
+import attr
+
+# TODO: would be nice to understand "cp a, #foo" and similar for xor/or/and/etc
+# TODO: would be AMAZING to understand labels when searching for code wow
+
+# This table is courtesy of pokemon-reverse-engineering-tools:
+# https://github.com/pret/pokemon-reverse-engineering-tools/blob/master/pokemontools/gbz80disasm.py
+gbz80_bitops = dict(enumerate([
+    "rlc b",       "rlc c",       "rlc d",       "rlc e",       "rlc h",       "rlc l",       "rlc [hl]",       "rlc a",       # $00 - $07
+    "rrc b",       "rrc c",       "rrc d",       "rrc e",       "rrc h",       "rrc l",       "rrc [hl]",       "rrc a",       # $08 - $0f
+    "rl b",        "rl c",        "rl d",        "rl e",        "rl h",        "rl l",        "rl [hl]",        "rl a",        # $10 - $17
+    "rr b",        "rr c",        "rr d",        "rr e",        "rr h",        "rr l",        "rr [hl]",        "rr a",        # $18 - $1f
+    "sla b",       "sla c",       "sla d",       "sla e",       "sla h",       "sla l",       "sla [hl]",       "sla a",       # $20 - $27
+    "sra b",       "sra c",       "sra d",       "sra e",       "sra h",       "sra l",       "sra [hl]",       "sra a",       # $28 - $2f
+    "swap b",      "swap c",      "swap d",      "swap e",      "swap h",      "swap l",      "swap [hl]",      "swap a",      # $30 - $37
+    "srl b",       "srl c",       "srl d",       "srl e",       "srl h",       "srl l",       "srl [hl]",       "srl a",       # $38 - $3f
+    "bit $00, b",  "bit $00, c",  "bit $00, d",  "bit $00, e",  "bit $00, h",  "bit $00, l",  "bit $00, [hl]",  "bit $00, a",  # $40 - $47
+    "bit $01, b",  "bit $01, c",  "bit $01, d",  "bit $01, e",  "bit $01, h",  "bit $01, l",  "bit $01, [hl]",  "bit $01, a",  # $48 - $4f
+    "bit $02, b",  "bit $02, c",  "bit $02, d",  "bit $02, e",  "bit $02, h",  "bit $02, l",  "bit $02, [hl]",  "bit $02, a",  # $50 - $57
+    "bit $03, b",  "bit $03, c",  "bit $03, d",  "bit $03, e",  "bit $03, h",  "bit $03, l",  "bit $03, [hl]",  "bit $03, a",  # $58 - $5f
+    "bit $04, b",  "bit $04, c",  "bit $04, d",  "bit $04, e",  "bit $04, h",  "bit $04, l",  "bit $04, [hl]",  "bit $04, a",  # $60 - $67
+    "bit $05, b",  "bit $05, c",  "bit $05, d",  "bit $05, e",  "bit $05, h",  "bit $05, l",  "bit $05, [hl]",  "bit $05, a",  # $68 - $6f
+    "bit $06, b",  "bit $06, c",  "bit $06, d",  "bit $06, e",  "bit $06, h",  "bit $06, l",  "bit $06, [hl]",  "bit $06, a",  # $70 - $77
+    "bit $07, b",  "bit $07, c",  "bit $07, d",  "bit $07, e",  "bit $07, h",  "bit $07, l",  "bit $07, [hl]",  "bit $07, a",  # $78 - $7f
+    "res $00, b",  "res $00, c",  "res $00, d",  "res $00, e",  "res $00, h",  "res $00, l",  "res $00, [hl]",  "res $00, a",  # $80 - $87
+    "res $01, b",  "res $01, c",  "res $01, d",  "res $01, e",  "res $01, h",  "res $01, l",  "res $01, [hl]",  "res $01, a",  # $88 - $8f
+    "res $02, b",  "res $02, c",  "res $02, d",  "res $02, e",  "res $02, h",  "res $02, l",  "res $02, [hl]",  "res $02, a",  # $90 - $97
+    "res $03, b",  "res $03, c",  "res $03, d",  "res $03, e",  "res $03, h",  "res $03, l",  "res $03, [hl]",  "res $03, a",  # $98 - $9f
+    "res $04, b",  "res $04, c",  "res $04, d",  "res $04, e",  "res $04, h",  "res $04, l",  "res $04, [hl]",  "res $04, a",  # $a0 - $a7
+    "res $05, b",  "res $05, c",  "res $05, d",  "res $05, e",  "res $05, h",  "res $05, l",  "res $05, [hl]",  "res $05, a",  # $a8 - $af
+    "res $06, b",  "res $06, c",  "res $06, d",  "res $06, e",  "res $06, h",  "res $06, l",  "res $06, [hl]",  "res $06, a",  # $b0 - $b7
+    "res $07, b",  "res $07, c",  "res $07, d",  "res $07, e",  "res $07, h",  "res $07, l",  "res $07, [hl]",  "res $07, a",  # $b8 - $bf
+    "set $00, b",  "set $00, c",  "set $00, d",  "set $00, e",  "set $00, h",  "set $00, l",  "set $00, [hl]",  "set $00, a",  # $c0 - $c7
+    "set $01, b",  "set $01, c",  "set $01, d",  "set $01, e",  "set $01, h",  "set $01, l",  "set $01, [hl]",  "set $01, a",  # $c8 - $cf
+    "set $02, b",  "set $02, c",  "set $02, d",  "set $02, e",  "set $02, h",  "set $02, l",  "set $02, [hl]",  "set $02, a",  # $d0 - $d7
+    "set $03, b",  "set $03, c",  "set $03, d",  "set $03, e",  "set $03, h",  "set $03, l",  "set $03, [hl]",  "set $03, a",  # $d8 - $df
+    "set $04, b",  "set $04, c",  "set $04, d",  "set $04, e",  "set $04, h",  "set $04, l",  "set $04, [hl]",  "set $04, a",  # $e0 - $e7
+    "set $05, b",  "set $05, c",  "set $05, d",  "set $05, e",  "set $05, h",  "set $05, l",  "set $05, [hl]",  "set $05, a",  # $e8 - $ef
+    "set $06, b",  "set $06, c",  "set $06, d",  "set $06, e",  "set $06, h",  "set $06, l",  "set $06, [hl]",  "set $06, a",  # $f0 - $f7
+    "set $07, b",  "set $07, c",  "set $07, d",  "set $07, e",  "set $07, h",  "set $07, l",  "set $07, [hl]",  "set $07, a"   # $f8 - $ff
+]))
+
+# This instruction list was carefully scraped from:
+# http://www.pastraiser.com/cpu/gameboy/gameboy_opcodes.html
+gbz80_instructions = {
+    0x00: 'nop',
+    0x01: 'ld bc, #d16',
+    0x02: 'ld [bc], a',
+    0x03: 'inc bc',
+    0x04: 'inc b',
+    0x05: 'dec b',
+    0x06: 'ld b, #d8',
+    0x07: 'rlca',
+    0x08: 'ld [#a16], sp',
+    0x09: 'add hl, bc',
+    0x0a: 'ld a, [bc]',
+    0x0b: 'dec bc',
+    0x0c: 'inc c',
+    0x0d: 'dec c',
+    0x0e: 'ld c, #d8',
+    0x0f: 'rrca',
+
+    0x10: 'stop',
+    0x11: 'ld de, #d16',
+    0x12: 'ld [de], a',
+    0x13: 'inc de',
+    0x14: 'inc d',
+    0x15: 'dec d',
+    0x16: 'ld d, #d8',
+    0x17: 'rla',
+    0x18: 'jr #r8',
+    0x19: 'add hl, de',
+    0x1a: 'ld a, [de]',
+    0x1b: 'dec de',
+    0x1c: 'inc e',
+    0x1d: 'dec e',
+    0x1e: 'ld e, #d8',
+    0x1f: 'rra',
+
+    0x20: 'jr nz, #r8',
+    0x21: 'ld hl, #d16',
+    0x22: 'ld [hl+], a',
+    0x23: 'inc hl',
+    0x24: 'inc h',
+    0x25: 'dec h',
+    0x26: 'ld h, #d8',
+    0x27: 'daa',
+    0x28: 'jr z, #r8',
+    0x29: 'add hl, hl',
+    0x2a: 'ld a, [hl+]',
+    0x2b: 'dec hl',
+    0x2c: 'inc l',
+    0x2d: 'dec l',
+    0x2e: 'ld l, #d8',
+    0x2f: 'cpl',
+
+    0x30: 'jr nc, #r8',
+    0x31: 'ld sp, #d16',
+    0x32: 'ld [hl-], a',
+    0x33: 'inc sp',
+    0x34: 'inc [hl]',
+    0x35: 'dec [hl]',
+    0x36: 'ld [hl], #d8',
+    0x37: 'scf',
+    0x38: 'jr c, #r8',
+    0x39: 'add hl, sp',
+    0x3a: 'ld a, [hl-]',
+    0x3b: 'dec sp',
+    0x3c: 'inc a',
+    0x3d: 'dec a',
+    0x3e: 'ld a, #d8',
+    0x3f: 'ccf',
+
+    0x40: 'ld b, b',
+    0x41: 'ld b, c',
+    0x42: 'ld b, d',
+    0x43: 'ld b, e',
+    0x44: 'ld b, h',
+    0x45: 'ld b, l',
+    0x46: 'ld b, [hl]',
+    0x47: 'ld b, a',
+    0x48: 'ld c, b',
+    0x49: 'ld c, c',
+    0x4a: 'ld c, d',
+    0x4b: 'ld c, e',
+    0x4c: 'ld c, h',
+    0x4d: 'ld c, l',
+    0x4e: 'ld c, [hl]',
+    0x4f: 'ld c, a',
+
+    0x50: 'ld d, b',
+    0x51: 'ld d, c',
+    0x52: 'ld d, d',
+    0x53: 'ld d, e',
+    0x54: 'ld d, h',
+    0x55: 'ld d, l',
+    0x56: 'ld d, [hl]',
+    0x57: 'ld d, a',
+    0x58: 'ld e, b',
+    0x59: 'ld e, c',
+    0x5a: 'ld e, d',
+    0x5b: 'ld e, e',
+    0x5c: 'ld e, h',
+    0x5d: 'ld e, l',
+    0x5e: 'ld e, [hl]',
+    0x5f: 'ld e, a',
+
+    0x60: 'ld h, b',
+    0x61: 'ld h, c',
+    0x62: 'ld h, d',
+    0x63: 'ld h, e',
+    0x64: 'ld h, h',
+    0x65: 'ld h, l',
+    0x66: 'ld h, [hl]',
+    0x67: 'ld h, a',
+    0x68: 'ld l, b',
+    0x69: 'ld l, c',
+    0x6a: 'ld l, d',
+    0x6b: 'ld l, e',
+    0x6c: 'ld l, h',
+    0x6d: 'ld l, l',
+    0x6e: 'ld l, [hl]',
+    0x6f: 'ld l, a',
+
+    0x70: 'ld [hl], b',
+    0x71: 'ld [hl], c',
+    0x72: 'ld [hl], d',
+    0x73: 'ld [hl], e',
+    0x74: 'ld [hl], h',
+    0x75: 'ld [hl], l',
+    0x76: 'halt',
+    0x77: 'ld [hl], a',
+    0x78: 'ld a, b',
+    0x79: 'ld a, c',
+    0x7a: 'ld a, d',
+    0x7b: 'ld a, e',
+    0x7c: 'ld a, h',
+    0x7d: 'ld a, l',
+    0x7e: 'ld a, [hl]',
+    0x7f: 'ld a, a',
+
+    0x80: 'add a, b',
+    0x81: 'add a, c',
+    0x82: 'add a, d',
+    0x83: 'add a, e',
+    0x84: 'add a, h',
+    0x85: 'add a, l',
+    0x86: 'add a, [hl]',
+    0x87: 'add a, a',
+    0x88: 'adc a, b',
+    0x89: 'adc a, c',
+    0x8a: 'adc a, d',
+    0x8b: 'adc a, e',
+    0x8c: 'adc a, h',
+    0x8d: 'adc a, l',
+    0x8e: 'adc a, [hl]',
+    0x8f: 'adc a, a',
+
+    0x90: 'sub b',
+    0x91: 'sub c',
+    0x92: 'sub d',
+    0x93: 'sub e',
+    0x94: 'sub h',
+    0x95: 'sub l',
+    0x96: 'sub [hl]',
+    0x97: 'sub a',
+    0x98: 'sbc a, b',
+    0x99: 'sbc a, c',
+    0x9a: 'sbc a, d',
+    0x9b: 'sbc a, e',
+    0x9c: 'sbc a, h',
+    0x9d: 'sbc a, l',
+    0x9e: 'sbc a, [hl]',
+    0x9f: 'sbc a, a',
+
+    0xa0: 'and b',
+    0xa1: 'and c',
+    0xa2: 'and d',
+    0xa3: 'and e',
+    0xa4: 'and h',
+    0xa5: 'and l',
+    0xa6: 'and [hl]',
+    0xa7: 'and a',
+    0xa8: 'xor b',
+    0xa9: 'xor c',
+    0xaa: 'xor d',
+    0xab: 'xor e',
+    0xac: 'xor h',
+    0xad: 'xor l',
+    0xae: 'xor [hl]',
+    0xaf: 'xor a',
+
+    0xb0: 'or b',
+    0xb1: 'or c',
+    0xb2: 'or d',
+    0xb3: 'or e',
+    0xb4: 'or h',
+    0xb5: 'or l',
+    0xb6: 'or [hl]',
+    0xb7: 'or a',
+    0xb8: 'cp b',
+    0xb9: 'cp c',
+    0xba: 'cp d',
+    0xbb: 'cp e',
+    0xbc: 'cp h',
+    0xbd: 'cp l',
+    0xbe: 'cp [hl]',
+    0xbf: 'cp a',
+
+    0xc0: 'ret nz',
+    0xc1: 'pop bc',
+    0xc2: 'jp nz, #a16',
+    0xc3: 'jp #a16',
+    0xc4: 'call nz, #a16',
+    0xc5: 'push bc',
+    0xc6: 'add a, #d8',
+    0xc7: 'rst $00',
+    0xc8: 'ret z',
+    0xc9: 'ret',
+    0xca: 'jp z, #a16',
+    0xcb: gbz80_bitops,
+    0xcc: 'call z, #a16',
+    0xcd: 'call #a16',
+    0xce: 'adc a, #d8',
+    0xcf: 'rst $08',
+
+    0xd0: 'ret nc',
+    0xd1: 'pop de',
+    0xd2: 'jp nc, #a16',
+    # 0xd3
+    0xd4: 'call nc, #a16',
+    0xd5: 'push de',
+    0xd6: 'sub #d8',
+    0xd7: 'rst $10',
+    0xd8: 'ret c',
+    0xd9: 'reti',
+    0xda: 'jp c, #a16',
+    # 0xdb
+    0xdc: 'call c, #a16',
+    # 0xdd
+    0xde: 'sbc a, #d8',
+    0xdf: 'rst $18',
+
+    0xe0: 'ldh [#a8], a',
+    0xe1: 'pop hl',
+    0xe2: 'ld [$ff00+c], a',  # XXX table claims 2 but this looks like 1 to me
+    # 0xe3
+    # 0xe4
+    0xe5: 'push hl',
+    0xe6: 'and #d8',
+    0xe7: 'rst $20',
+    0xe8: 'add sp, #r8',
+    0xe9: 'jp [hl]',
+    0xea: 'ld [#a16], a',
+    # 0xeb
+    # 0xec
+    # 0xed
+    0xee: 'xor #d8',
+    0xef: 'rst $28',
+
+    0xf0: 'ldh a, [#a8]',
+    0xf1: 'pop af',
+    0xf2: 'ld a, [$ff00+c]',  # XXX table says 2 but this looks like 1 to me
+    0xf3: 'di',
+    # 0xf4
+    0xf5: 'push af',
+    0xf6: 'or #d8',
+    0xf7: 'rst $30',
+    0xf8: 'ld hl, sp+#r8',
+    0xf9: 'ld sp, hl',
+    0xfa: 'ld a, [#a16]',
+    0xfb: 'ei',
+    # 0xfc
+    # 0xfd
+    0xfe: 'cp #d8',
+    0xff: 'rst $38',
+}
+
+
+class Atom:
+    is_constant = False
+    is_input = False
+    is_register = False
+
+    def is_compatible_with(self, other):
+        return self == other
+
+    def render(self):
+        raise NotImplementedError
+
+
+@attr.s
+class InputAtom(Atom):
+    name = attr.ib()
+    length = attr.ib()
+    is_input = True
+
+    def is_compatible_with(self, other):
+        # Inputs are compatible with anything that's not a register
+        # TODO does length matter?
+        return isinstance(other, (InputAtom, ConstantAtom))
+
+    def render(self, value=None):
+        if value is None:
+            return '#' + self.name
+        else:
+            return "${:0{width}x}".format(value, width=self.length * 2)
+
+
+@attr.s
+class RegisterAtom(Atom):
+    name = attr.ib()
+    is_register = True
+
+    def render(self):
+        return self.name
+
+
+@attr.s
+class ConstantAtom(Atom):
+    value = attr.ib()
+    is_constant = True
+
+    def render(self):
+        if self.value < 256:
+            return "${:02x}".format(self.value)
+        else:
+            return "${:04x}".format(self.value)
+
+
+class Instruction:
+    def __init__(self, syntax, prefix, mnemonic, length, args, inputs):
+        self.syntax = syntax
+        self.prefix = prefix
+        self.mnemonic = mnemonic
+        self.length = length
+        # list of (items_to_sum, is_pointer) tuples
+        self.args = args
+        self.inputs = inputs
+
+    @staticmethod
+    def partial_parse(syntax):
+        mnemonic, *argstrs = re.split('[, ]+', syntax)
+        args = []
+
+        for argstr in argstrs:
+            ptr = False
+            if argstr.startswith('[') and argstr.endswith(']'):
+                ptr = True
+                argstr = argstr[1:-1]
+
+            atoms = []
+            for atomstr in re.split('[+](?=.)', argstr):
+                if atomstr.startswith('#'):
+                    atom = InputAtom(atomstr[1:], None)
+                elif atomstr in (
+                        'a', 'f', 'b', 'c', 'd', 'e', 'h', 'l',
+                        'af', 'bc', 'de', 'hl', 'hl+', 'hl-',
+                        'sp', 'pc', 'z', 'nz', 'c', 'nc',
+                        ):
+                    atom = RegisterAtom(atomstr)
+                elif atomstr.startswith('$'):
+                    atom = ConstantAtom(int(atomstr[1:], 16))
+                elif atomstr.isdigit():
+                    atom = ConstantAtom(int(atomstr))
+                else:
+                    raise SyntaxError(
+                        "Unrecognized argument {!r} in instruction {!r}"
+                        .format(atomstr, syntax))
+
+                atoms.append(atom)
+
+            args.append((atoms, ptr))
+
+        return mnemonic, args
+
+    @classmethod
+    def parse(cls, syntax, prefix):
+        mnemonic, args = cls.partial_parse(syntax)
+        inputs = []
+
+        length = len(prefix)
+        for atoms, is_ptr in args:
+            for atom in atoms:
+                if not atom.is_input:
+                    continue
+                if atom.name in ('d16', 'a16'):
+                    atom.length = 2
+                elif atom.name in ('d8', 'a8', 'r8'):
+                    atom.length = 1
+                else:
+                    raise SyntaxError(
+                        "Unrecognized input name {}".format(atom.name))
+
+                inputs.append(atom)
+                length += atom.length
+
+        self = cls(syntax, prefix, mnemonic, length, args, inputs)
+        assert self(ignore_inputs=True) == syntax
+        return self
+
+    def __repr__(self):
+        return "<{} 0x{}: {}>".format(
+            type(self).__name__,
+            self.prefix.hex(),
+            self(ignore_inputs=True),
+        )
+
+    def __call__(self, *inputs, ignore_inputs=False):
+        inputs = list(inputs)
+        if not ignore_inputs:
+            if len(inputs) != len(self.inputs):
+                raise TypeError(
+                    "{} needs {} inputs, got {}"
+                    .format(self.syntax, len(self.inputs), len(inputs)))
+
+        args = []
+        for atoms, is_ptr in self.args:
+            atomstrs = []
+            for atom in atoms:
+                if not ignore_inputs and atom.is_input:
+                    atomstrs.append(atom.render(inputs.pop(0)))
+                else:
+                    atomstrs.append(atom.render())
+            expr = '+'.join(atomstrs)
+            if is_ptr:
+                expr = '[' + expr + ']'
+            args.append(expr)
+
+        out = self.mnemonic
+        if args:
+            out = out + ' ' + ', '.join(args)
+        return out
+
+    def match_inputs(self, mnemonic, args):
+        if self.mnemonic != mnemonic:
+            return
+        if len(self.args) != len(args):
+            return
+
+        # Compare args
+        input_pairs = []
+        for (atoms1, ptr1), (atoms2, ptr2) in zip(self.args, args):
+            if ptr1 != ptr2:
+                return
+            if len(atoms1) != len(atoms2):
+                return
+            # TODO technically, A+B is the same as B+A, but the lists are het
+            # so i can't sort
+            # TODO also, constant folding could make A+B the same as C
+            for atom1, atom2 in zip(atoms1, atoms2):
+                if not atom1.is_compatible_with(atom2):
+                    return
+                if atom1.is_input:
+                    input_pairs.append((atom1, atom2))
+
+        return input_pairs
+
+
+class InstructionSet:
+    def __init__(self, instructions):
+        self.instructions = {}
+        self.mnemonics = defaultdict(set)
+        self._load_instructions(instructions)
+
+    def _load_instructions(self, instructions, *, prefix=b''):
+        for n, syntax in instructions.items():
+            byte = bytes([n])
+            if isinstance(syntax, dict):
+                # Nested args, for the bitops instruction
+                self._load_instructions(syntax, prefix=byte)
+            else:
+                instr = Instruction.parse(syntax, prefix + byte)
+                self.instructions[prefix + byte] = instr
+                self.mnemonics[instr.mnemonic].add(instr)
+
+
+# The instruction set built from the opcode table at import time;
+# disassemble() and find_code() both look instructions up in it.
+gbz80 = InstructionSet(gbz80_instructions)
+
+# A sample pattern in the syntax find_code() accepts, where "#name" marks an
+# unknown operand.
+# NOTE(review): nothing visible here reads this module-level name -- it looks
+# like a leftover fixture from development; confirm before relying on it.
+needle = """push bc
+push hl
+ld a, [#wd11e]
+dec a
+ld hl, #PokedexOrder
+ld b, 0
+ld c, a
+add hl, bc
+ld a, [hl]
+ld [#wd11e], a
+pop hl
+pop bc
+ret"""
+"""
+            \xc5        
+            \xe5        
+            \xfa (..)   
+            \x3d        
+            \x21 (..)   
+            \x06 \x00   
+            \x4f        
+            \x09        
+            \x7e        
+            \xea \1     
+            \xe1        
+            \xc1        
+            \xc9        
+            """
+# Machine code for the sample `needle`, with b'XY' and b'ZW' standing in for
+# the two unknown addresses.
+# NOTE(review): appears to be an unused development fixture -- confirm.
+haystack = b'\xc5\xe5\xfaXY\x3d\x21ZW\x06\x00\x4f\x09\x7e\xeaXY\xe1\xc1\xc9'
+
+# ------------------------------------------------------------------------------
+# Disassemble
+
+def disassemble(haystack):
+    i = 0
+    while i < len(haystack):
+        for l in range(2):
+            prefix = haystack[i:i+l+1]
+            if prefix in gbz80.instructions:
+                instr = gbz80.instructions[prefix]
+                break
+        else:
+            raise SyntaxError
+
+        i += len(prefix)
+        inputs = []
+        for inp in instr.inputs:
+            inputs.append(int.from_bytes(
+                haystack[i:i + inp.length], byteorder='little'))
+            i += inp.length
+
+        print(instr(*inputs))
+
+
+# ------------------------------------------------------------------------------
+# Pattern match
+
+def find_code(haystack, needle, **kwargs):
+    # TODO error if something in kwargs isn't a pattern input?
+    # TODO the return value here is goofy
+    # TODO maybe use finditer and yield instead
+    pattern_chunks = []
+    input_table = OrderedDict()
+    matched_instructions = []
+    for instruction in needle.splitlines():
+        instruction = re.sub(';.*', '', instruction).strip()
+        if not instruction:
+            continue
+        mnemonic, args = Instruction.partial_parse(instruction)
+        candidates = gbz80.mnemonics[mnemonic]
+        for candidate in candidates:
+            inputs = candidate.match_inputs(mnemonic, args)
+            if inputs is not None:
+                break
+        else:
+            raise SyntaxError
+
+        instr = candidate
+        pattern_chunks.append(re.escape(instr.prefix))
+        pattern_atoms = []
+        for instr_input, pattern_atom in inputs:
+            pattern_atoms.append(pattern_atom)
+            if pattern_atom.is_constant:
+                pattern_chunks.append(re.escape(
+                    pattern_atom.value.to_bytes(
+                        instr_input.length, byteorder='little')))
+            elif pattern_atom.name in input_table:
+                pattern_chunks.append(input_table[pattern_atom.name])
+            else:
+                if pattern_atom.name in kwargs:
+                    inner_pattern = re.escape(
+                        kwargs[pattern_atom.name].to_bytes(
+                            instr_input.length, byteorder='little'))
+                else:
+                    inner_pattern = b'.' * instr_input.length
+
+                group_name = pattern_atom.name.encode('ascii')
+                input_table[pattern_atom.name] = b'(?P=%b)' % (group_name,)
+                pattern_chunks.append(b'(?P<%b>%b)' % (group_name, inner_pattern))
+        matched_instructions.append((instr, pattern_atoms))
+
+    pattern = b''.join(pattern_chunks)
+
+    m = re.search(pattern, haystack, flags=re.DOTALL)
+    if m:
+        matched_inputs = {}
+        for inp in input_table:
+            matched_inputs[inp] = int.from_bytes(
+                m.group(inp), byteorder='little')
+
+        for instr, pattern_atoms in matched_instructions:
+            inputs = []
+            for atom in pattern_atoms:
+                if atom.is_constant:
+                    inputs.append(atom.value)
+                else:
+                    inputs.append(matched_inputs[atom.name])
+
+        return m, matched_inputs
+    else:
+        return
--- a/pokedex/extract/rby.py
+++ b/pokedex/extract/rby.py
@ -14,13 +14,13 @@ import hashlib
 import io
 import logging
 from pathlib import Path
-import re
 import sys

 from camel import Camel
 from classtools import reify
 from construct import *

+from pokedex.extract.lib.gbz80 import find_code
 import pokedex.schema as schema

 # TODO set this up to colorcode and use {} formatting
@ -1167,8 +1167,6 @@ pokemon_struct = Struct(
    Padding(1),
 )

-pokemon_name_struct = PokemonCString('pokemon_name', 10)
-

 evos_moves_struct = Struct(
    'evos_moves',
@ -1281,28 +1279,17 @@ class RBYCart:
        Return a dict of raw file offsets.  The keys are the names used in the
        pokered project.
        """
-        # The base stats are always in the same place in RBY, and only slightly
-        # off in RG.  Not sure why!  But it hopefully means recompilation
-        # doesn't affect them.
-        addresses = {
-            # These ones have, thusfar, defied automatic detection, as they're
-            # just part of a big old block of data — so I can't just look for
-            # code nearby.
-            # TODO these are for rby; fix for rg, and maybe y?
-            'BaseStats': unbank('0E:43DE'),
-            'MewBaseStats': unbank('01:425B'),
-        }
-
-        # For everything else, the general approach is to find some assembly
-        # code that appears just before the data of interest.  It's pretty
-        # hacky, but since translators (and even modders) would have little
-        # reason to rearrange functions or inject new ones in these odd places,
-        # it ought to work well enough.  And it's better than ferreting out and
-        # hard-coding piles of addresses.
+        # The ideal approach is to find some assembly code that appears just
+        # before the data of interest.  It's pretty hacky, but since
+        # translators (and even modders) would have little reason to rearrange
+        # functions or inject new ones in these odd places, it ought to work
+        # well enough.  And it's better than ferreting out and hard-coding
+        # piles of addresses.
        # The only hard part is that assembly code that contains an address
        # won't work, since that address will also vary per game.
        # Each of the landmarks used here appears in every official cartridge
        # exactly once.
+        addresses = {}

        # This is an entire function used by the Pokédex and which immediately
        # precedes all the flavor text.
@ -1322,6 +1309,72 @@ class RBYCart:
            raise CartDetectionError("Can't find evolution and moveset table")
        addresses['EvosMovesPointerTable'] = idx + len(asm_WriteMonMoves_ShiftMoveData) + 5

+        # Several lists of names are accessed by a single function, which looks
+        # through a list of pointers to find the right set of names to use.
+        # That's great news for me: I can just grab all of those delicious
+        # pointers at once.  Here's an excerpt from GetName.
+        match = find_code(self.data, '''
+            inc d
+            ;.skip
+            ld hl, #NamePointers
+            add hl,de
+            ld a,[hl+]
+            ldh [$96],a
+            ld a,[hl]
+            ldh [$95],a
+            ldh a,[$95]
+            ld h,a
+            ldh a,[$96]
+            ld l,a
+            ld a,[#wd0b5]
+            ld b,a
+            ld c,0
+            ;.nextName
+            ld d,h
+            ld e,l
+            ;.nextChar
+            ld a,[hl+]
+            cp $50  ; terminator @, encoded
+        ''')
+        if not match:
+            raise CartDetectionError("Can't find name array")
+        rem, inputs = match
+        start = inputs['NamePointers']
+        name_pointers = Array(7, ULInt16('dummy')).parse(
+            self.data[start:start + 14])
+        # One downside to the Game Boy memory structure is that banks are
+        # not stored anywhere near their corresponding addresses.  Most
+        # bank numbers are hardcoded here, but Pokémon names are in a different
+        # bank in Japanese games, so we've gotta scrape the bank too...
+        match = find_code(self.data, '''
+        ;GetMonName::
+            push hl
+            ldh a,[#H_LOADEDROMBANK]
+            push af
+            ld a,#BANK_MonsterNames
+            ldh [#H_LOADEDROMBANK],a
+            ld [#MBC1RomBank],a
+            ld a,[#wd11e]
+            dec a
+            ld hl,#MonsterNames
+        ''',
+            H_LOADEDROMBANK=0xB8,  # full address is $FFB8; ldh adds the $FF
+            MBC1RomBank=0x2000,
+            MonsterNames=name_pointers[0]
+        )
+        if not match:
+            raise CartDetectionError("Can't find Pokémon names")
+        rem, inputs = match
+
+        addresses['MonsterNames'] = unbank(
+            inputs['BANK_MonsterNames'], name_pointers[0])
+        addresses['MoveNames'] = unbank(0x2C, name_pointers[1])
+        # 2: UnusedNames  (unused, obviously)
+        addresses['ItemNames'] = unbank(0x01, name_pointers[3])
+        # 4: wPartyMonOT  (only useful while the game is running)
+        # 5: wEnemyMonOT  (only useful while the game is running)
+        addresses['TrainerNames'] = unbank(0x0E, name_pointers[6])
+
        # Finding TMs is a bit harder.  They come right after a function for
        # looking up a TM number, which is very short and very full of
        # addresses.  So here's a regex.
@ -1330,26 +1383,29 @@ class RBYCart:
        # In English it is, unsurprisingly, 0xD11E.
        # `TechnicalMachines` is the address we're looking for, which should
        # immediately follow what this matches.
-        asm_TMToMove_rx = re.compile(rb'''
-            \xfa (..)   # ld a, [wd11e]
-            \x3d        # dec a
-            \x21 (..)   # ld hl, TechnicalMachines
-            \x06 \x00   # ld b, $0
-            \x4f        # ld c, a
-            \x09        # add hl, bc
-            \x7e        # ld a, [hl]
-            \xea \1     # ld [wd11e], a
-            \xc9        # ret
-        ''', flags=re.DOTALL | re.VERBOSE)
-        for match in asm_TMToMove_rx.finditer(self.data):
-            matched_addr = ULInt16('...').parse(match.group(2))
-            tentative_addr = match.end()
+        match = find_code(self.data, '''
+            ld a, [#wd11e]
+            dec a
+            ld hl, #TechnicalMachines
+            ld b, $0
+            ld c, a
+            add hl, bc
+            ld a, [hl]
+            ld [#wd11e], a
+            ret
+        ''')
+        if match:
+            rem, inputs = match
+            # TODO this should maybe also check that the address immediately follows this code
+            matched_addr = inputs['TechnicalMachines']
+            tentative_addr = rem.end()
            # Remember, addresses don't include the bank!
            _, banked_addr = bank(tentative_addr)
            if matched_addr == banked_addr:
-                asm_wd11e_addr = match.group(1)
+                asm_wd11e_addr = inputs['wd11e']
                addresses['TechnicalMachines'] = tentative_addr
-                break
+            else:
+                raise RuntimeError
            # TODO should there really be more than one match?
        else:
            raise CartDetectionError("Can't find technical machines list")
@ -1359,61 +1415,107 @@ class RBYCart:
        # These are almost immediately after the Pokédex entries themselves,
        # but this actually seems easier than figuring out where a table of
        # pointers ends.
-        asm_IndexToPokedex_rx = re.compile(rb'''
-            \xc5        # push bc
-            \xe5        # push hl
-            \xfa (..)   # ld a,[wd11e]
-            \x3d        # dec a
-            \x21 (..)   # ld hl,PokedexOrder
-            \x06 \x00   # ld b,0
-            \x4f        # ld c,a
-            \x09        # add hl,bc
-            \x7e        # ld a,[hl]
-            \xea \1     # ld [wd11e],a
-            \xe1        # pop hl
-            \xc1        # pop bc
-            \xc9        # ret
-        ''', flags=re.DOTALL | re.VERBOSE)
-        for match in asm_IndexToPokedex_rx.finditer(self.data):
-            matched_addr = ULInt16('...').parse(match.group(2))
-            tentative_addr = match.end()
+        match = find_code(self.data, '''
+            push bc
+            push hl
+            ld a, [#wd11e]
+            dec a
+            ld hl, #PokedexOrder
+            ld b, 0
+            ld c, a
+            add hl, bc
+            ld a, [hl]
+            ld [#wd11e], a
+            pop hl
+            pop bc
+            ret
+        ''', wd11e=asm_wd11e_addr)
+        if match:
+            rem, inputs = match
+            matched_addr = inputs['PokedexOrder']
+            tentative_addr = rem.end()
            # Remember, addresses don't include the bank!
            _, banked_addr = bank(tentative_addr)
-            if matched_addr == banked_addr and asm_wd11e_addr == match.group(1):
+            if matched_addr == banked_addr:
                addresses['PokedexOrder'] = tentative_addr
-                break
+            else:
+                raise RuntimeError
        else:
            raise CartDetectionError("Can't find Pokédex order")

-        # This is assembly code that appears near the end of a function called
-        # WaitForSoundToFinish.
-        end_of_WaitForSoundToFinish = bytes.fromhex('afb6 23b6 2323 b6')
-        try:
-            idx = self.data.index(end_of_WaitForSoundToFinish)
-        except ValueError:
-            raise CartDetectionError("Can't find name array")
-        # There are a couple more bytes in the function, but they involve an
-        # address so they can't be searched for.  Red/Green/Blue have four;
-        # Yellow has an extra 'and', which is annoying, but at least easy to
-        # handle.
-        start = idx + len(end_of_WaitForSoundToFinish)
-        if self.data[start] == 0xA7:
-            # Yellow; skip one more byte
-            start += 1
-        start += 4
-
-        name_pointers = Array(7, ULInt16('dummy')).parse(self.data[start:start + 14])
-        # One downside to the Game Boy memory structure is that banks are not
-        # stored anywhere near their corresponding addresses, so the bank
-        # numbers are hardcoded here.  They're fairly unlikely to change
-        # between games.  Right?  Probably?
-        addresses['MonsterNames'] = unbank(0x07, name_pointers[0])
-        addresses['MoveNames'] = unbank(0x2C, name_pointers[1])
-        # 2: UnusedNames  (unused, obviously)
-        addresses['ItemNames'] = unbank(0x01, name_pointers[3])
-        # 4: wPartyMonOT  (only useful while the game is running)
-        # 5: wEnemyMonOT  (only useful while the game is running)
-        addresses['TrainerNames'] = unbank(0x0E, name_pointers[6])
+        # Ah, but then, we have base stats.  These don't have code nearby;
+        # they're just stuck immediately after moves.  Except in R/G, where
+        # they appear /before/ moves!  And we don't know what version we're
+        # running yet, because the addresses detected in this method are used
+        # for language detection.  Hmm.
+        # Here's plan B: look for the function that /loads/ base stats, and
+        # scrape the address out of it.  This function is a bit hairy; I've had
+        # to expand some of pokered's macros and rewrite the jumps to something
+        # that the rudimentary code matcher can understand.
+        match = find_code(self.data, '''
+            ldh a, [#H_LOADEDROMBANK]
+            push af
+            ld a, #BANK_BaseStats
+            ldh [#H_LOADEDROMBANK], a
+            ld [#MBC1RomBank], a
+            push bc
+            push de
+            push hl
+            ld a, [#wd11e]
+            push af
+            ld a,[#wd0b5]
+            ld [#wd11e],a
+            ld de,#FossilKabutopsPic
+            ld b,$66 ; size of Kabutops fossil and Ghost sprites
+            cp #FOSSIL_KABUTOPS ; Kabutops fossil
+            jr z,#specialID1
+            ld de,#GhostPic
+            cp #MON_GHOST ; Ghost
+            jr z,#specialID2
+            ld de,#FossilAerodactylPic
+            ld b,$77 ; size of Aerodactyl fossil sprite
+            cp #FOSSIL_AERODACTYL ; Aerodactyl fossil
+            jr z,#specialID3
+            cp #MEW
+            jr z,#mew
+            ld a, #IndexToPokedexPredef
+            call #IndexToPokedex   ; convert pokemon ID in [wd11e] to pokedex number
+            ld a,[#wd11e]
+            dec a
+            ld bc, #MonBaseStatsLength
+            ld hl, #BaseStats
+            call #AddNTimes
+            ld de, #wMonHeader
+            ld bc, #MonBaseStatsLength
+            call #CopyData
+            jr #done1
+            ;.specialID
+            ld hl, #wMonHSpriteDim
+            ld [hl], b ; write sprite dimensions
+            inc hl
+            ld [hl], e ; write front sprite pointer
+            inc hl
+            ld [hl], d
+            jr #done2
+            ;.mew
+            ld hl, #MewBaseStats
+            ld de, #wMonHeader
+            ld bc, #MonBaseStatsLength
+            ld a, #BANK_MewBaseStats
+            call #FarCopyData
+        ''',
+            # These are constants; I left them in the above code for clarity
+            H_LOADEDROMBANK=0xB8,  # full address is $FFB8; ldh adds the $FF
+            MBC1RomBank=0x2000,
+            # This was scraped previously
+            wd11e=asm_wd11e_addr,
+        )
+        if match:
+            rem, inputs = match
+            addresses['BaseStats'] = unbank(inputs['BANK_BaseStats'], inputs['BaseStats'])
+            addresses['MewBaseStats'] = unbank(inputs['BANK_MewBaseStats'], inputs['MewBaseStats'])
+        else:
+            raise CartDetectionError("Can't find base stats")

        return addresses

@ -1612,7 +1714,12 @@ class RBYCart:
        ret = [None] * self.NUM_POKEMON

        self.stream.seek(self.addrs['MonsterNames'])
-        for index, pokemon_name in enumerate(Array(self.max_pokemon_index, pokemon_name_struct).parse_stream(self.stream), start=1):
+        # TODO i don't like this, but they don't have explicit terminators...
+        if self.language == 'ja':
+            name_length = 5
+        else:
+            name_length = 10
+        for index, pokemon_name in enumerate(Array(self.max_pokemon_index, PokemonCString('...', name_length)).parse_stream(self.stream), start=1):
            try:
                id = self.pokedex_order[index]
            except KeyError:
@ -1631,7 +1738,6 @@ class RBYCart:
    def pokemon_records(self):
        """List of pokemon_structs."""
        self.stream.seek(self.addrs['BaseStats'])
-        print(self.stream.read(100).hex())
        records = Array(self.NUM_POKEMON - 1, pokemon_struct).parse_stream(self.stream)
        # Mew's data is, awkwardly, stored separately
        self.stream.seek(self.addrs['MewBaseStats'])