From 122da8d885f0881197260923179abc010d4cd9eb Mon Sep 17 00:00:00 2001 From: "Eevee (Lexy Munroe)" Date: Sun, 21 Aug 2016 16:44:07 -0700 Subject: [PATCH] Write a gbz80 pattern matcher, for more robust address sniffing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this, Pokémon names are finally detected correctly from both R/G and R/B. Hurrah. Yellow doesn't work yet, though. Sigh. --- pokedex/extract/lib/gbz80.py | 647 +++++++++++++++++++++++++++++++++++ pokedex/extract/rby.py | 282 ++++++++++----- 2 files changed, 841 insertions(+), 88 deletions(-) create mode 100644 pokedex/extract/lib/gbz80.py diff --git a/pokedex/extract/lib/gbz80.py b/pokedex/extract/lib/gbz80.py new file mode 100644 index 0000000..59b64b6 --- /dev/null +++ b/pokedex/extract/lib/gbz80.py @@ -0,0 +1,647 @@ +"""Stuff for dealing with the Game Boy's Z80-ish machine code. Most notably, +can do pattern-matching against a chunk of assembly with missing values. +""" +from collections import OrderedDict +from collections import defaultdict +import re + +import attr + +# TODO: would be nice to understand "cp a, #foo" and similar for xor/or/and/etc +# TODO: would be AMAZING to understand labels when searching for code wow + +# This table is courtesy of pokemon-reverse-engineering-tools: +# https://github.com/pret/pokemon-reverse-engineering-tools/blob/master/pokemontools/gbz80disasm.py +gbz80_bitops = dict(enumerate([ + "rlc b", "rlc c", "rlc d", "rlc e", "rlc h", "rlc l", "rlc [hl]", "rlc a", # $00 - $07 + "rrc b", "rrc c", "rrc d", "rrc e", "rrc h", "rrc l", "rrc [hl]", "rrc a", # $08 - $0f + "rl b", "rl c", "rl d", "rl e", "rl h", "rl l", "rl [hl]", "rl a", # $10 - $17 + "rr b", "rr c", "rr d", "rr e", "rr h", "rr l", "rr [hl]", "rr a", # $18 - $1f + "sla b", "sla c", "sla d", "sla e", "sla h", "sla l", "sla [hl]", "sla a", # $20 - $27 + "sra b", "sra c", "sra d", "sra e", "sra h", "sra l", "sra [hl]", "sra a", # $28 - $2f + "swap b", "swap c", "swap 
d", "swap e", "swap h", "swap l", "swap [hl]", "swap a", # $30 - $37 + "srl b", "srl c", "srl d", "srl e", "srl h", "srl l", "srl [hl]", "srl a", # $38 - $3f + "bit $00, b", "bit $00, c", "bit $00, d", "bit $00, e", "bit $00, h", "bit $00, l", "bit $00, [hl]", "bit $00, a", # $40 - $47 + "bit $01, b", "bit $01, c", "bit $01, d", "bit $01, e", "bit $01, h", "bit $01, l", "bit $01, [hl]", "bit $01, a", # $48 - $4f + "bit $02, b", "bit $02, c", "bit $02, d", "bit $02, e", "bit $02, h", "bit $02, l", "bit $02, [hl]", "bit $02, a", # $50 - $57 + "bit $03, b", "bit $03, c", "bit $03, d", "bit $03, e", "bit $03, h", "bit $03, l", "bit $03, [hl]", "bit $03, a", # $58 - $5f + "bit $04, b", "bit $04, c", "bit $04, d", "bit $04, e", "bit $04, h", "bit $04, l", "bit $04, [hl]", "bit $04, a", # $60 - $67 + "bit $05, b", "bit $05, c", "bit $05, d", "bit $05, e", "bit $05, h", "bit $05, l", "bit $05, [hl]", "bit $05, a", # $68 - $6f + "bit $06, b", "bit $06, c", "bit $06, d", "bit $06, e", "bit $06, h", "bit $06, l", "bit $06, [hl]", "bit $06, a", # $70 - $77 + "bit $07, b", "bit $07, c", "bit $07, d", "bit $07, e", "bit $07, h", "bit $07, l", "bit $07, [hl]", "bit $07, a", # $78 - $7f + "res $00, b", "res $00, c", "res $00, d", "res $00, e", "res $00, h", "res $00, l", "res $00, [hl]", "res $00, a", # $80 - $87 + "res $01, b", "res $01, c", "res $01, d", "res $01, e", "res $01, h", "res $01, l", "res $01, [hl]", "res $01, a", # $88 - $8f + "res $02, b", "res $02, c", "res $02, d", "res $02, e", "res $02, h", "res $02, l", "res $02, [hl]", "res $02, a", # $90 - $97 + "res $03, b", "res $03, c", "res $03, d", "res $03, e", "res $03, h", "res $03, l", "res $03, [hl]", "res $03, a", # $98 - $9f + "res $04, b", "res $04, c", "res $04, d", "res $04, e", "res $04, h", "res $04, l", "res $04, [hl]", "res $04, a", # $a0 - $a7 + "res $05, b", "res $05, c", "res $05, d", "res $05, e", "res $05, h", "res $05, l", "res $05, [hl]", "res $05, a", # $a8 - $af + "res $06, b", "res $06, c", "res 
$06, d", "res $06, e", "res $06, h", "res $06, l", "res $06, [hl]", "res $06, a", # $b0 - $b7 + "res $07, b", "res $07, c", "res $07, d", "res $07, e", "res $07, h", "res $07, l", "res $07, [hl]", "res $07, a", # $b8 - $bf + "set $00, b", "set $00, c", "set $00, d", "set $00, e", "set $00, h", "set $00, l", "set $00, [hl]", "set $00, a", # $c0 - $c7 + "set $01, b", "set $01, c", "set $01, d", "set $01, e", "set $01, h", "set $01, l", "set $01, [hl]", "set $01, a", # $c8 - $cf + "set $02, b", "set $02, c", "set $02, d", "set $02, e", "set $02, h", "set $02, l", "set $02, [hl]", "set $02, a", # $d0 - $d7 + "set $03, b", "set $03, c", "set $03, d", "set $03, e", "set $03, h", "set $03, l", "set $03, [hl]", "set $03, a", # $d8 - $df + "set $04, b", "set $04, c", "set $04, d", "set $04, e", "set $04, h", "set $04, l", "set $04, [hl]", "set $04, a", # $e0 - $e7 + "set $05, b", "set $05, c", "set $05, d", "set $05, e", "set $05, h", "set $05, l", "set $05, [hl]", "set $05, a", # $e8 - $ef + "set $06, b", "set $06, c", "set $06, d", "set $06, e", "set $06, h", "set $06, l", "set $06, [hl]", "set $06, a", # $f0 - $f7 + "set $07, b", "set $07, c", "set $07, d", "set $07, e", "set $07, h", "set $07, l", "set $07, [hl]", "set $07, a" # $f8 - $ff +])) + +# This instruction list was carefully scraped from: +# http://www.pastraiser.com/cpu/gameboy/gameboy_opcodes.html +gbz80_instructions = { + 0x00: 'nop', + 0x01: 'ld bc, #d16', + 0x02: 'ld [bc], a', + 0x03: 'inc bc', + 0x04: 'inc b', + 0x05: 'dec b', + 0x06: 'ld b, #d8', + 0x07: 'rlca', + 0x08: 'ld [#a16], sp', + 0x09: 'add hl, bc', + 0x0a: 'ld a, [bc]', + 0x0b: 'dec bc', + 0x0c: 'inc c', + 0x0d: 'dec c', + 0x0e: 'ld c, #d8', + 0x0f: 'rrca', + + 0x10: 'stop', + 0x11: 'ld de, #d16', + 0x12: 'ld [de], a', + 0x13: 'inc de', + 0x14: 'inc d', + 0x15: 'dec d', + 0x16: 'ld d, #d8', + 0x17: 'rla', + 0x18: 'jr #r8', + 0x19: 'add hl, de', + 0x1a: 'ld a, [de]', + 0x1b: 'dec de', + 0x1c: 'inc e', + 0x1d: 'dec e', + 0x1e: 'ld e, #d8', + 
0x1f: 'rra', + + 0x20: 'jr nz, #r8', + 0x21: 'ld hl, #d16', + 0x22: 'ld [hl+], a', + 0x23: 'inc hl', + 0x24: 'inc h', + 0x25: 'dec h', + 0x26: 'ld h, #d8', + 0x27: 'daa', + 0x28: 'jr z, #r8', + 0x29: 'add hl, hl', + 0x2a: 'ld a, [hl+]', + 0x2b: 'dec hl', + 0x2c: 'inc l', + 0x2d: 'dec l', + 0x2e: 'ld l, #d8', + 0x2f: 'cpl', + + 0x30: 'jr nc, #r8', + 0x31: 'ld sp, #d16', + 0x32: 'ld [hl-], a', + 0x33: 'inc sp', + 0x34: 'inc [hl]', + 0x35: 'dec [hl]', + 0x36: 'ld [hl], #d8', + 0x37: 'scf', + 0x38: 'jr c, #r8', + 0x39: 'add hl, sp', + 0x3a: 'ld a, [hl-]', + 0x3b: 'dec sp', + 0x3c: 'inc a', + 0x3d: 'dec a', + 0x3e: 'ld a, #d8', + 0x3f: 'ccf', + + 0x40: 'ld b, b', + 0x41: 'ld b, c', + 0x42: 'ld b, d', + 0x43: 'ld b, e', + 0x44: 'ld b, h', + 0x45: 'ld b, l', + 0x46: 'ld b, [hl]', + 0x47: 'ld b, a', + 0x48: 'ld c, b', + 0x49: 'ld c, c', + 0x4a: 'ld c, d', + 0x4b: 'ld c, e', + 0x4c: 'ld c, h', + 0x4d: 'ld c, l', + 0x4e: 'ld c, [hl]', + 0x4f: 'ld c, a', + + 0x50: 'ld d, b', + 0x51: 'ld d, c', + 0x52: 'ld d, d', + 0x53: 'ld d, e', + 0x54: 'ld d, h', + 0x55: 'ld d, l', + 0x56: 'ld d, [hl]', + 0x57: 'ld d, a', + 0x58: 'ld e, b', + 0x59: 'ld e, c', + 0x5a: 'ld e, d', + 0x5b: 'ld e, e', + 0x5c: 'ld e, h', + 0x5d: 'ld e, l', + 0x5e: 'ld e, [hl]', + 0x5f: 'ld e, a', + + 0x60: 'ld h, b', + 0x61: 'ld h, c', + 0x62: 'ld h, d', + 0x63: 'ld h, e', + 0x64: 'ld h, h', + 0x65: 'ld h, l', + 0x66: 'ld h, [hl]', + 0x67: 'ld h, a', + 0x68: 'ld l, b', + 0x69: 'ld l, c', + 0x6a: 'ld l, d', + 0x6b: 'ld l, e', + 0x6c: 'ld l, h', + 0x6d: 'ld l, l', + 0x6e: 'ld l, [hl]', + 0x6f: 'ld l, a', + + 0x70: 'ld [hl], b', + 0x71: 'ld [hl], c', + 0x72: 'ld [hl], d', + 0x73: 'ld [hl], e', + 0x74: 'ld [hl], h', + 0x75: 'ld [hl], l', + 0x76: 'halt', + 0x77: 'ld [hl], a', + 0x78: 'ld a, b', + 0x79: 'ld a, c', + 0x7a: 'ld a, d', + 0x7b: 'ld a, e', + 0x7c: 'ld a, h', + 0x7d: 'ld a, l', + 0x7e: 'ld a, [hl]', + 0x7f: 'ld a, a', + + 0x80: 'add a, b', + 0x81: 'add a, c', + 0x82: 'add a, d', + 0x83: 'add a, e', + 0x84: 
'add a, h', + 0x85: 'add a, l', + 0x86: 'add a, [hl]', + 0x87: 'add a, a', + 0x88: 'adc a, b', + 0x89: 'adc a, c', + 0x8a: 'adc a, d', + 0x8b: 'adc a, e', + 0x8c: 'adc a, h', + 0x8d: 'adc a, l', + 0x8e: 'adc a, [hl]', + 0x8f: 'adc a, a', + + 0x90: 'sub b', + 0x91: 'sub c', + 0x92: 'sub d', + 0x93: 'sub e', + 0x94: 'sub h', + 0x95: 'sub l', + 0x96: 'sub [hl]', + 0x97: 'sub a', + 0x98: 'sbc a, b', + 0x99: 'sbc a, c', + 0x9a: 'sbc a, d', + 0x9b: 'sbc a, e', + 0x9c: 'sbc a, h', + 0x9d: 'sbc a, l', + 0x9e: 'sbc a, [hl]', + 0x9f: 'sbc a, a', + + 0xa0: 'and b', + 0xa1: 'and c', + 0xa2: 'and d', + 0xa3: 'and e', + 0xa4: 'and h', + 0xa5: 'and l', + 0xa6: 'and [hl]', + 0xa7: 'and a', + 0xa8: 'xor b', + 0xa9: 'xor c', + 0xaa: 'xor d', + 0xab: 'xor e', + 0xac: 'xor h', + 0xad: 'xor l', + 0xae: 'xor [hl]', + 0xaf: 'xor a', + + 0xb0: 'or b', + 0xb1: 'or c', + 0xb2: 'or d', + 0xb3: 'or e', + 0xb4: 'or h', + 0xb5: 'or l', + 0xb6: 'or [hl]', + 0xb7: 'or a', + 0xb8: 'cp b', + 0xb9: 'cp c', + 0xba: 'cp d', + 0xbb: 'cp e', + 0xbc: 'cp h', + 0xbd: 'cp l', + 0xbe: 'cp [hl]', + 0xbf: 'cp a', + + 0xc0: 'ret nz', + 0xc1: 'pop bc', + 0xc2: 'jp nz, #a16', + 0xc3: 'jp #a16', + 0xc4: 'call nz, #a16', + 0xc5: 'push bc', + 0xc6: 'add a, #d8', + 0xc7: 'rst $00', + 0xc8: 'ret z', + 0xc9: 'ret', + 0xca: 'jp z, #a16', + 0xcb: gbz80_bitops, + 0xcc: 'call z, #a16', + 0xcd: 'call #a16', + 0xce: 'adc a, #d8', + 0xcf: 'rst $08', + + 0xd0: 'ret nc', + 0xd1: 'pop de', + 0xd2: 'jp nc, #a16', + # 0xd3 + 0xd4: 'call nc, #a16', + 0xd5: 'push de', + 0xd6: 'sub #d8', + 0xd7: 'rst $10', + 0xd8: 'ret c', + 0xd9: 'reti', + 0xda: 'jp c, #a16', + # 0xdb + 0xdc: 'call c, #a16', + # 0xdd + 0xde: 'sbc a, #d8', + 0xdf: 'rst $18', + + 0xe0: 'ldh [#a8], a', + 0xe1: 'pop hl', + 0xe2: 'ld [$ff00+c], a', # XXX table claims 2 but this looks like 1 to me + # 0xe3 + # 0xe4 + 0xe5: 'push hl', + 0xe6: 'and #d8', + 0xe7: 'rst $20', + 0xe8: 'add sp, #r8', + 0xe9: 'jp [hl]', + 0xea: 'ld [#a16], a', + # 0xeb + # 0xec + # 0xed + 0xee: 
'xor #d8', + 0xef: 'rst $28', + + 0xf0: 'ldh a, [#a8]', + 0xf1: 'pop af', + 0xf2: 'ld a, [$ff00+c]', # XXX table says 2 but this looks like 1 to me + 0xf3: 'di', + # 0xf4 + 0xf5: 'push af', + 0xf6: 'or #d8', + 0xf7: 'rst $30', + 0xf8: 'ld hl, sp+#r8', + 0xf9: 'ld sp, hl', + 0xfa: 'ld a, [#a16]', + 0xfb: 'ei', + # 0xfc + # 0xfd + 0xfe: 'cp #d8', + 0xff: 'rst $38', +} + + +class Atom: + is_constant = False + is_input = False + is_register = False + + def is_compatible_with(self, other): + return self == other + + def render(self): + raise NotImplementedError + + +@attr.s +class InputAtom(Atom): + name = attr.ib() + length = attr.ib() + is_input = True + + def is_compatible_with(self, other): + # Inputs are compatible with anything that's not a register + # TODO does length matter? + return isinstance(other, (InputAtom, ConstantAtom)) + + def render(self, value=None): + if value is None: + return '#' + self.name + else: + return "${:0{width}x}".format(value, width=self.length * 2) + + +@attr.s +class RegisterAtom(Atom): + name = attr.ib() + is_register = True + + def render(self): + return self.name + + +@attr.s +class ConstantAtom(Atom): + value = attr.ib() + is_constant = True + + def render(self): + if self.value < 256: + return "${:02x}".format(self.value) + else: + return "${:04x}".format(self.value) + + +class Instruction: + def __init__(self, syntax, prefix, mnemonic, length, args, inputs): + self.syntax = syntax + self.prefix = prefix + self.mnemonic = mnemonic + self.length = length + # list of (items_to_sum, is_pointer) tuples + self.args = args + self.inputs = inputs + + @staticmethod + def partial_parse(syntax): + mnemonic, *argstrs = re.split('[, ]+', syntax) + args = [] + + for argstr in argstrs: + ptr = False + if argstr.startswith('[') and argstr.endswith(']'): + ptr = True + argstr = argstr[1:-1] + + atoms = [] + for atomstr in re.split('[+](?=.)', argstr): + if atomstr.startswith('#'): + atom = InputAtom(atomstr[1:], None) + elif atomstr in ( + 'a', 
'f', 'b', 'c', 'd', 'e', 'h', 'l', + 'af', 'bc', 'de', 'hl', 'hl+', 'hl-', + 'sp', 'pc', 'z', 'nz', 'c', 'nc', + ): + atom = RegisterAtom(atomstr) + elif atomstr.startswith('$'): + atom = ConstantAtom(int(atomstr[1:], 16)) + elif atomstr.isdigit(): + atom = ConstantAtom(int(atomstr)) + else: + raise SyntaxError( + "Unrecognized argument {!r} in instruction {!r}" + .format(atomstr, syntax)) + + atoms.append(atom) + + args.append((atoms, ptr)) + + return mnemonic, args + + @classmethod + def parse(cls, syntax, prefix): + mnemonic, args = cls.partial_parse(syntax) + inputs = [] + + length = len(prefix) + for atoms, is_ptr in args: + for atom in atoms: + if not atom.is_input: + continue + if atom.name in ('d16', 'a16'): + atom.length = 2 + elif atom.name in ('d8', 'a8', 'r8'): + atom.length = 1 + else: + raise SyntaxError( + "Unrecognized input name {}".format(atom.name)) + + inputs.append(atom) + length += atom.length + + self = cls(syntax, prefix, mnemonic, length, args, inputs) + assert self(ignore_inputs=True) == syntax + return self + + def __repr__(self): + return "<{} 0x{}: {}>".format( + type(self).__name__, + self.prefix.hex(), + self(ignore_inputs=True), + ) + + def __call__(self, *inputs, ignore_inputs=False): + inputs = list(inputs) + if not ignore_inputs: + if len(inputs) != len(self.inputs): + raise TypeError( + "{} needs {} inputs, got {}" + .format(self.syntax, len(self.inputs), len(inputs))) + + args = [] + for atoms, is_ptr in self.args: + atomstrs = [] + for atom in atoms: + if not ignore_inputs and atom.is_input: + atomstrs.append(atom.render(inputs.pop(0))) + else: + atomstrs.append(atom.render()) + expr = '+'.join(atomstrs) + if is_ptr: + expr = '[' + expr + ']' + args.append(expr) + + out = self.mnemonic + if args: + out = out + ' ' + ', '.join(args) + return out + + def match_inputs(self, mnemonic, args): + if self.mnemonic != mnemonic: + return + if len(self.args) != len(args): + return + + # Compare args + input_pairs = [] + for (atoms1, 
ptr1), (atoms2, ptr2) in zip(self.args, args): + if ptr1 != ptr2: + return + if len(atoms1) != len(atoms2): + return + # TODO technically, A+B is the same as B+A, but the lists are het + # so i can't sort + # TODO also, constant folding could make A+B the same as C + for atom1, atom2 in zip(atoms1, atoms2): + if not atom1.is_compatible_with(atom2): + return + if atom1.is_input: + input_pairs.append((atom1, atom2)) + + return input_pairs + + +class InstructionSet: + def __init__(self, instructions): + self.instructions = {} + self.mnemonics = defaultdict(set) + self._load_instructions(instructions) + + def _load_instructions(self, instructions, *, prefix=b''): + for n, syntax in instructions.items(): + byte = bytes([n]) + if isinstance(syntax, dict): + # Nested args, for the bitops instruction + self._load_instructions(syntax, prefix=byte) + else: + instr = Instruction.parse(syntax, prefix + byte) + self.instructions[prefix + byte] = instr + self.mnemonics[instr.mnemonic].add(instr) + + +gbz80 = InstructionSet(gbz80_instructions) + +needle = """push bc +push hl +ld a, [#wd11e] +dec a +ld hl, #PokedexOrder +ld b, 0 +ld c, a +add hl, bc +ld a, [hl] +ld [#wd11e], a +pop hl +pop bc +ret""" +""" + \xc5 + \xe5 + \xfa (..) + \x3d + \x21 (..) 
+ \x06 \x00 + \x4f + \x09 + \x7e + \xea \1 + \xe1 + \xc1 + \xc9 + """ +haystack = b'\xc5\xe5\xfaXY\x3d\x21ZW\x06\x00\x4f\x09\x7e\xeaXY\xe1\xc1\xc9' + +# ------------------------------------------------------------------------------ +# Disassemble + +def disassemble(haystack): + i = 0 + while i < len(haystack): + for l in range(2): + prefix = haystack[i:i+l+1] + if prefix in gbz80.instructions: + instr = gbz80.instructions[prefix] + break + else: + raise SyntaxError + + i += len(prefix) + inputs = [] + for inp in instr.inputs: + inputs.append(int.from_bytes( + haystack[i:i + inp.length], byteorder='little')) + i += inp.length + + print(instr(*inputs)) + + +# ------------------------------------------------------------------------------ +# Pattern match + +def find_code(haystack, needle, **kwargs): + # TODO error if something in kwargs isn't a pattern input? + # TODO the return value here is goofy + # TODO maybe use finditer and yield instead + pattern_chunks = [] + input_table = OrderedDict() + matched_instructions = [] + for instruction in needle.splitlines(): + instruction = re.sub(';.*', '', instruction).strip() + if not instruction: + continue + mnemonic, args = Instruction.partial_parse(instruction) + candidates = gbz80.mnemonics[mnemonic] + for candidate in candidates: + inputs = candidate.match_inputs(mnemonic, args) + if inputs is not None: + break + else: + raise SyntaxError + + instr = candidate + pattern_chunks.append(re.escape(instr.prefix)) + pattern_atoms = [] + for instr_input, pattern_atom in inputs: + pattern_atoms.append(pattern_atom) + if pattern_atom.is_constant: + pattern_chunks.append(re.escape( + pattern_atom.value.to_bytes( + instr_input.length, byteorder='little'))) + elif pattern_atom.name in input_table: + pattern_chunks.append(input_table[pattern_atom.name]) + else: + if pattern_atom.name in kwargs: + inner_pattern = re.escape( + kwargs[pattern_atom.name].to_bytes( + instr_input.length, byteorder='little')) + else: + inner_pattern = b'.' 
* instr_input.length + + group_name = pattern_atom.name.encode('ascii') + input_table[pattern_atom.name] = b'(?P=%b)' % (group_name,) + pattern_chunks.append(b'(?P<%b>%b)' % (group_name, inner_pattern)) + matched_instructions.append((instr, pattern_atoms)) + + pattern = b''.join(pattern_chunks) + + m = re.search(pattern, haystack, flags=re.DOTALL) + if m: + matched_inputs = {} + for inp in input_table: + matched_inputs[inp] = int.from_bytes( + m.group(inp), byteorder='little') + + for instr, pattern_atoms in matched_instructions: + inputs = [] + for atom in pattern_atoms: + if atom.is_constant: + inputs.append(atom.value) + else: + inputs.append(matched_inputs[atom.name]) + + return m, matched_inputs + else: + return diff --git a/pokedex/extract/rby.py b/pokedex/extract/rby.py index a4fd160..219be0e 100644 --- a/pokedex/extract/rby.py +++ b/pokedex/extract/rby.py @@ -14,13 +14,13 @@ import hashlib import io import logging from pathlib import Path -import re import sys from camel import Camel from classtools import reify from construct import * +from pokedex.extract.lib.gbz80 import find_code import pokedex.schema as schema # TODO set this up to colorcode and use {} formatting @@ -1167,8 +1167,6 @@ pokemon_struct = Struct( Padding(1), ) -pokemon_name_struct = PokemonCString('pokemon_name', 10) - evos_moves_struct = Struct( 'evos_moves', @@ -1281,28 +1279,17 @@ class RBYCart: Return a dict of raw file offsets. The keys are the names used in the pokered project. """ - # The base stats are always in the same place in RBY, and only slightly - # off in RG. Not sure why! But it hopefully means recompilation - # doesn't affect them. - addresses = { - # These ones have, thusfar, defied automatic detection, as they're - # just part of a big old block of data — so I can't just look for - # code nearby. - # TODO these are for rby; fix for rg, and maybe y? 
- 'BaseStats': unbank('0E:43DE'), - 'MewBaseStats': unbank('01:425B'), - } - - # For everything else, the general approach is to find some assembly - # code that appears just before the data of interest. It's pretty - # hacky, but since translators (and even modders) would have little - # reason to rearrange functions or inject new ones in these odd places, - # it ought to work well enough. And it's better than ferreting out and - # hard-coding piles of addresses. + # The ideal approach is to find some assembly code that appears just + # before the data of interest. It's pretty hacky, but since + # translators (and even modders) would have little reason to rearrange + # functions or inject new ones in these odd places, it ought to work + # well enough. And it's better than ferreting out and hard-coding + # piles of addresses. # The only hard part is that assembly code that contains an address # won't work, since that address will also vary per game. # Each of the landmarks used here appears in every official cartridge # exactly once. + addresses = {} # This is an entire function used by the Pokédex and which immediately # precedes all the flavor text. @@ -1322,6 +1309,72 @@ class RBYCart: raise CartDetectionError("Can't find evolution and moveset table") addresses['EvosMovesPointerTable'] = idx + len(asm_WriteMonMoves_ShiftMoveData) + 5 + # Several lists of names are accessed by a single function, which looks + # through a list of pointers to find the right set of names to use. + # That's great news for me: I can just grab all of those delicious + # pointers at once. Here's an excerpt from GetName. 
+ match = find_code(self.data, ''' + inc d + ;.skip + ld hl, #NamePointers + add hl,de + ld a,[hl+] + ldh [$96],a + ld a,[hl] + ldh [$95],a + ldh a,[$95] + ld h,a + ldh a,[$96] + ld l,a + ld a,[#wd0b5] + ld b,a + ld c,0 + ;.nextName + ld d,h + ld e,l + ;.nextChar + ld a,[hl+] + cp $50 ; terminator @, encoded + ''') + if not match: + raise CartDetectionError("Can't find name array") + rem, inputs = match + start = inputs['NamePointers'] + name_pointers = Array(7, ULInt16('dummy')).parse( + self.data[start:start + 14]) + # One downside to the Game Boy memory structure is that banks are + # not stored anywhere near their corresponding addresses. Most + # bank numbers are hardcoded here, but Pokémon names are in a different + # bank in Japanese games, so we've gotta scrape the bank too... + match = find_code(self.data, ''' + ;GetMonName:: + push hl + ldh a,[#H_LOADEDROMBANK] + push af + ld a,#BANK_MonsterNames + ldh [#H_LOADEDROMBANK],a + ld [#MBC1RomBank],a + ld a,[#wd11e] + dec a + ld hl,#MonsterNames + ''', + H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF + MBC1RomBank=0x2000, + MonsterNames=name_pointers[0] + ) + if not match: + raise CartDetectionError("Can't find Pokémon names") + rem, inputs = match + + addresses['MonsterNames'] = unbank( + inputs['BANK_MonsterNames'], name_pointers[0]) + addresses['MoveNames'] = unbank(0x2C, name_pointers[1]) + # 2: UnusedNames (unused, obviously) + addresses['ItemNames'] = unbank(0x01, name_pointers[3]) + # 4: wPartyMonOT (only useful while the game is running) + # 5: wEnemyMonOT (only useful while the game is running) + addresses['TrainerNames'] = unbank(0x0E, name_pointers[6]) + # Finding TMs is a bit harder. They come right after a function for # looking up a TM number, which is very short and very full of # addresses. So here's a regex. @@ -1330,26 +1383,29 @@ class RBYCart: # In English it is, unsurprisingly, 0xD11E. 
# `TechnicalMachines` is the address we're looking for, which should # immediately follow what this matches. - asm_TMToMove_rx = re.compile(rb''' - \xfa (..) # ld a, [wd11e] - \x3d # dec a - \x21 (..) # ld hl, TechnicalMachines - \x06 \x00 # ld b, $0 - \x4f # ld c, a - \x09 # add hl, bc - \x7e # ld a, [hl] - \xea \1 # ld [wd11e], a - \xc9 # ret - ''', flags=re.DOTALL | re.VERBOSE) - for match in asm_TMToMove_rx.finditer(self.data): - matched_addr = ULInt16('...').parse(match.group(2)) - tentative_addr = match.end() + match = find_code(self.data, ''' + ld a, [#wd11e] + dec a + ld hl, #TechnicalMachines + ld b, $0 + ld c, a + add hl, bc + ld a, [hl] + ld [#wd11e], a + ret + ''') + if match: + rem, inputs = match + # TODO this should mayybe also check that the address immediately follows this code + matched_addr = inputs['TechnicalMachines'] + tentative_addr = rem.end() # Remember, addresses don't include the bank! _, banked_addr = bank(tentative_addr) if matched_addr == banked_addr: - asm_wd11e_addr = match.group(1) + asm_wd11e_addr = inputs['wd11e'] addresses['TechnicalMachines'] = tentative_addr - break + else: + raise RuntimeError # TODO should there really be more than one match? else: raise CartDetectionError("Can't find technical machines list") @@ -1359,61 +1415,107 @@ class RBYCart: # These are almost immediately after the Pokédex entries themselves, # but this actually seems easier than figuring out where a table of # pointers ends. - asm_IndexToPokedex_rx = re.compile(rb''' - \xc5 # push bc - \xe5 # push hl - \xfa (..) # ld a,[wd11e] - \x3d # dec a - \x21 (..) 
# ld hl,PokedexOrder - \x06 \x00 # ld b,0 - \x4f # ld c,a - \x09 # add hl,bc - \x7e # ld a,[hl] - \xea \1 # ld [wd11e],a - \xe1 # pop hl - \xc1 # pop bc - \xc9 # ret - ''', flags=re.DOTALL | re.VERBOSE) - for match in asm_IndexToPokedex_rx.finditer(self.data): - matched_addr = ULInt16('...').parse(match.group(2)) - tentative_addr = match.end() + match = find_code(self.data, ''' + push bc + push hl + ld a, [#wd11e] + dec a + ld hl, #PokedexOrder + ld b, 0 + ld c, a + add hl, bc + ld a, [hl] + ld [#wd11e], a + pop hl + pop bc + ret + ''', wd11e=asm_wd11e_addr) + if match: + rem, inputs = match + matched_addr = inputs['PokedexOrder'] + tentative_addr = rem.end() # Remember, addresses don't include the bank! _, banked_addr = bank(tentative_addr) - if matched_addr == banked_addr and asm_wd11e_addr == match.group(1): + if matched_addr == banked_addr: addresses['PokedexOrder'] = tentative_addr - break + else: + raise RuntimeError else: raise CartDetectionError("Can't find Pokédex order") - # This is assembly code that appears near the end of a function called - # WaitForSoundToFinish. - end_of_WaitForSoundToFinish = bytes.fromhex('afb6 23b6 2323 b6') - try: - idx = self.data.index(end_of_WaitForSoundToFinish) - except ValueError: - raise CartDetectionError("Can't find name array") - # There are a couple more bytes in the function, but they involve an - # address so they can't be searched for. Red/Green/Blue have four; - # Yellow has an extra 'and', which is annoying, but at least easy to - # handle. - start = idx + len(end_of_WaitForSoundToFinish) - if self.data[start] == 0xA7: - # Yellow; skip one more byte - start += 1 - start += 4 - - name_pointers = Array(7, ULInt16('dummy')).parse(self.data[start:start + 14]) - # One downside to the Game Boy memory structure is that banks are not - # stored anywhere near their corresponding addresses, so the bank - # numbers are hardcoded here. They're fairly unlikely to change - # between games. Right? Probably? 
- addresses['MonsterNames'] = unbank(0x07, name_pointers[0]) - addresses['MoveNames'] = unbank(0x2C, name_pointers[1]) - # 2: UnusedNames (unused, obviously) - addresses['ItemNames'] = unbank(0x01, name_pointers[3]) - # 4: wPartyMonOT (only useful while the game is running) - # 5: wEnemyMonOT (only useful while the game is running) - addresses['TrainerNames'] = unbank(0x0E, name_pointers[6]) + # Ah, but then, we have base stats. These don't have code nearby; + # they're just stuck immediately after moves. Except in R/G, where + # they appear /before/ moves! And we don't know what version we're + # running yet, because the addresses detected in this method are used + # for language detection. Hmm. + # Here's plan B: look for the function that /loads/ base stats, and + # scrape the address out of it. This function is a bit hairy; I've had + # to expand some of pokered's macros and rewrite the jumps to something + # that the rudimentary code matcher can understand. + match = find_code(self.data, ''' + ldh a, [#H_LOADEDROMBANK] + push af + ld a, #BANK_BaseStats + ldh [#H_LOADEDROMBANK], a + ld [#MBC1RomBank], a + push bc + push de + push hl + ld a, [#wd11e] + push af + ld a,[#wd0b5] + ld [#wd11e],a + ld de,#FossilKabutopsPic + ld b,$66 ; size of Kabutops fossil and Ghost sprites + cp #FOSSIL_KABUTOPS ; Kabutops fossil + jr z,#specialID1 + ld de,#GhostPic + cp #MON_GHOST ; Ghost + jr z,#specialID2 + ld de,#FossilAerodactylPic + ld b,$77 ; size of Aerodactyl fossil sprite + cp #FOSSIL_AERODACTYL ; Aerodactyl fossil + jr z,#specialID3 + cp #MEW + jr z,#mew + ld a, #IndexToPokedexPredef + call #IndexToPokedex ; convert pokemon ID in [wd11e] to pokedex number + ld a,[#wd11e] + dec a + ld bc, #MonBaseStatsLength + ld hl, #BaseStats + call #AddNTimes + ld de, #wMonHeader + ld bc, #MonBaseStatsLength + call #CopyData + jr #done1 + ;.specialID + ld hl, #wMonHSpriteDim + ld [hl], b ; write sprite dimensions + inc hl + ld [hl], e ; write front sprite pointer + inc hl + ld [hl], d 
+ jr #done2 + ;.mew + ld hl, #MewBaseStats + ld de, #wMonHeader + ld bc, #MonBaseStatsLength + ld a, #BANK_MewBaseStats + call #FarCopyData + ''', + # These are constants; I left them in the above code for clarity + H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF + MBC1RomBank=0x2000, + # This was scraped previously + wd11e=asm_wd11e_addr, + ) + if match: + rem, inputs = match + addresses['BaseStats'] = unbank(inputs['BANK_BaseStats'], inputs['BaseStats']) + addresses['MewBaseStats'] = unbank(inputs['BANK_MewBaseStats'], inputs['MewBaseStats']) + else: + raise CartDetectionError("Can't find base stats") return addresses @@ -1612,7 +1714,12 @@ class RBYCart: ret = [None] * self.NUM_POKEMON self.stream.seek(self.addrs['MonsterNames']) - for index, pokemon_name in enumerate(Array(self.max_pokemon_index, pokemon_name_struct).parse_stream(self.stream), start=1): + # TODO i don't like this, but they don't have explicit terminators... + if self.language == 'ja': + name_length = 5 + else: + name_length = 10 + for index, pokemon_name in enumerate(Array(self.max_pokemon_index, PokemonCString('...', name_length)).parse_stream(self.stream), start=1): try: id = self.pokedex_order[index] except KeyError: @@ -1631,7 +1738,6 @@ class RBYCart: def pokemon_records(self): """List of pokemon_structs.""" self.stream.seek(self.addrs['BaseStats']) - print(self.stream.read(100).hex()) records = Array(self.NUM_POKEMON - 1, pokemon_struct).parse_stream(self.stream) # Mew's data is, awkwardly, stored separately self.stream.seek(self.addrs['MewBaseStats'])