Write a gbz80 pattern matcher, for more robust address sniffing

With this, Pokémon names are finally detected correctly from both R/G
and R/B.  Hurrah.

Yellow doesn't work yet, though.  Sigh.
This commit is contained in:
Eevee (Lexy Munroe) 2016-08-21 16:44:07 -07:00
parent 969d671c48
commit 122da8d885
2 changed files with 841 additions and 88 deletions

View file

@ -0,0 +1,647 @@
"""Stuff for dealing with the Game Boy's Z80-ish machine code. Most notably,
can do pattern-matching against a chunk of assembly with missing values.
"""
from collections import OrderedDict
from collections import defaultdict
import re
import attr
# TODO: would be nice to understand "cp a, #foo" and similar for xor/or/and/etc
# TODO: would be AMAZING to understand labels when searching for code wow
# This table is courtesy of pokemon-reverse-engineering-tools:
# https://github.com/pret/pokemon-reverse-engineering-tools/blob/master/pokemontools/gbz80disasm.py
gbz80_bitops = dict(enumerate([
"rlc b", "rlc c", "rlc d", "rlc e", "rlc h", "rlc l", "rlc [hl]", "rlc a", # $00 - $07
"rrc b", "rrc c", "rrc d", "rrc e", "rrc h", "rrc l", "rrc [hl]", "rrc a", # $08 - $0f
"rl b", "rl c", "rl d", "rl e", "rl h", "rl l", "rl [hl]", "rl a", # $10 - $17
"rr b", "rr c", "rr d", "rr e", "rr h", "rr l", "rr [hl]", "rr a", # $18 - $1f
"sla b", "sla c", "sla d", "sla e", "sla h", "sla l", "sla [hl]", "sla a", # $20 - $27
"sra b", "sra c", "sra d", "sra e", "sra h", "sra l", "sra [hl]", "sra a", # $28 - $2f
"swap b", "swap c", "swap d", "swap e", "swap h", "swap l", "swap [hl]", "swap a", # $30 - $37
"srl b", "srl c", "srl d", "srl e", "srl h", "srl l", "srl [hl]", "srl a", # $38 - $3f
"bit $00, b", "bit $00, c", "bit $00, d", "bit $00, e", "bit $00, h", "bit $00, l", "bit $00, [hl]", "bit $00, a", # $40 - $47
"bit $01, b", "bit $01, c", "bit $01, d", "bit $01, e", "bit $01, h", "bit $01, l", "bit $01, [hl]", "bit $01, a", # $48 - $4f
"bit $02, b", "bit $02, c", "bit $02, d", "bit $02, e", "bit $02, h", "bit $02, l", "bit $02, [hl]", "bit $02, a", # $50 - $57
"bit $03, b", "bit $03, c", "bit $03, d", "bit $03, e", "bit $03, h", "bit $03, l", "bit $03, [hl]", "bit $03, a", # $58 - $5f
"bit $04, b", "bit $04, c", "bit $04, d", "bit $04, e", "bit $04, h", "bit $04, l", "bit $04, [hl]", "bit $04, a", # $60 - $67
"bit $05, b", "bit $05, c", "bit $05, d", "bit $05, e", "bit $05, h", "bit $05, l", "bit $05, [hl]", "bit $05, a", # $68 - $6f
"bit $06, b", "bit $06, c", "bit $06, d", "bit $06, e", "bit $06, h", "bit $06, l", "bit $06, [hl]", "bit $06, a", # $70 - $77
"bit $07, b", "bit $07, c", "bit $07, d", "bit $07, e", "bit $07, h", "bit $07, l", "bit $07, [hl]", "bit $07, a", # $78 - $7f
"res $00, b", "res $00, c", "res $00, d", "res $00, e", "res $00, h", "res $00, l", "res $00, [hl]", "res $00, a", # $80 - $87
"res $01, b", "res $01, c", "res $01, d", "res $01, e", "res $01, h", "res $01, l", "res $01, [hl]", "res $01, a", # $88 - $8f
"res $02, b", "res $02, c", "res $02, d", "res $02, e", "res $02, h", "res $02, l", "res $02, [hl]", "res $02, a", # $90 - $97
"res $03, b", "res $03, c", "res $03, d", "res $03, e", "res $03, h", "res $03, l", "res $03, [hl]", "res $03, a", # $98 - $9f
"res $04, b", "res $04, c", "res $04, d", "res $04, e", "res $04, h", "res $04, l", "res $04, [hl]", "res $04, a", # $a0 - $a7
"res $05, b", "res $05, c", "res $05, d", "res $05, e", "res $05, h", "res $05, l", "res $05, [hl]", "res $05, a", # $a8 - $af
"res $06, b", "res $06, c", "res $06, d", "res $06, e", "res $06, h", "res $06, l", "res $06, [hl]", "res $06, a", # $b0 - $b7
"res $07, b", "res $07, c", "res $07, d", "res $07, e", "res $07, h", "res $07, l", "res $07, [hl]", "res $07, a", # $b8 - $bf
"set $00, b", "set $00, c", "set $00, d", "set $00, e", "set $00, h", "set $00, l", "set $00, [hl]", "set $00, a", # $c0 - $c7
"set $01, b", "set $01, c", "set $01, d", "set $01, e", "set $01, h", "set $01, l", "set $01, [hl]", "set $01, a", # $c8 - $cf
"set $02, b", "set $02, c", "set $02, d", "set $02, e", "set $02, h", "set $02, l", "set $02, [hl]", "set $02, a", # $d0 - $d7
"set $03, b", "set $03, c", "set $03, d", "set $03, e", "set $03, h", "set $03, l", "set $03, [hl]", "set $03, a", # $d8 - $df
"set $04, b", "set $04, c", "set $04, d", "set $04, e", "set $04, h", "set $04, l", "set $04, [hl]", "set $04, a", # $e0 - $e7
"set $05, b", "set $05, c", "set $05, d", "set $05, e", "set $05, h", "set $05, l", "set $05, [hl]", "set $05, a", # $e8 - $ef
"set $06, b", "set $06, c", "set $06, d", "set $06, e", "set $06, h", "set $06, l", "set $06, [hl]", "set $06, a", # $f0 - $f7
"set $07, b", "set $07, c", "set $07, d", "set $07, e", "set $07, h", "set $07, l", "set $07, [hl]", "set $07, a" # $f8 - $ff
]))
# This instruction list was carefully scraped from:
# http://www.pastraiser.com/cpu/gameboy/gameboy_opcodes.html
gbz80_instructions = {
0x00: 'nop',
0x01: 'ld bc, #d16',
0x02: 'ld [bc], a',
0x03: 'inc bc',
0x04: 'inc b',
0x05: 'dec b',
0x06: 'ld b, #d8',
0x07: 'rlca',
0x08: 'ld [#a16], sp',
0x09: 'add hl, bc',
0x0a: 'ld a, [bc]',
0x0b: 'dec bc',
0x0c: 'inc c',
0x0d: 'dec c',
0x0e: 'ld c, #d8',
0x0f: 'rrca',
0x10: 'stop',
0x11: 'ld de, #d16',
0x12: 'ld [de], a',
0x13: 'inc de',
0x14: 'inc d',
0x15: 'dec d',
0x16: 'ld d, #d8',
0x17: 'rla',
0x18: 'jr #r8',
0x19: 'add hl, de',
0x1a: 'ld a, [de]',
0x1b: 'dec de',
0x1c: 'inc e',
0x1d: 'dec e',
0x1e: 'ld e, #d8',
0x1f: 'rra',
0x20: 'jr nz, #r8',
0x21: 'ld hl, #d16',
0x22: 'ld [hl+], a',
0x23: 'inc hl',
0x24: 'inc h',
0x25: 'dec h',
0x26: 'ld h, #d8',
0x27: 'daa',
0x28: 'jr z, #r8',
0x29: 'add hl, hl',
0x2a: 'ld a, [hl+]',
0x2b: 'dec hl',
0x2c: 'inc l',
0x2d: 'dec l',
0x2e: 'ld l, #d8',
0x2f: 'cpl',
0x30: 'jr nc, #r8',
0x31: 'ld sp, #d16',
0x32: 'ld [hl-], a',
0x33: 'inc sp',
0x34: 'inc [hl]',
0x35: 'dec [hl]',
0x36: 'ld [hl], #d8',
0x37: 'scf',
0x38: 'jr c, #r8',
0x39: 'add hl, sp',
0x3a: 'ld a, [hl-]',
0x3b: 'dec sp',
0x3c: 'inc a',
0x3d: 'dec a',
0x3e: 'ld a, #d8',
0x3f: 'ccf',
0x40: 'ld b, b',
0x41: 'ld b, c',
0x42: 'ld b, d',
0x43: 'ld b, e',
0x44: 'ld b, h',
0x45: 'ld b, l',
0x46: 'ld b, [hl]',
0x47: 'ld b, a',
0x48: 'ld c, b',
0x49: 'ld c, c',
0x4a: 'ld c, d',
0x4b: 'ld c, e',
0x4c: 'ld c, h',
0x4d: 'ld c, l',
0x4e: 'ld c, [hl]',
0x4f: 'ld c, a',
0x50: 'ld d, b',
0x51: 'ld d, c',
0x52: 'ld d, d',
0x53: 'ld d, e',
0x54: 'ld d, h',
0x55: 'ld d, l',
0x56: 'ld d, [hl]',
0x57: 'ld d, a',
0x58: 'ld e, b',
0x59: 'ld e, c',
0x5a: 'ld e, d',
0x5b: 'ld e, e',
0x5c: 'ld e, h',
0x5d: 'ld e, l',
0x5e: 'ld e, [hl]',
0x5f: 'ld e, a',
0x60: 'ld h, b',
0x61: 'ld h, c',
0x62: 'ld h, d',
0x63: 'ld h, e',
0x64: 'ld h, h',
0x65: 'ld h, l',
0x66: 'ld h, [hl]',
0x67: 'ld h, a',
0x68: 'ld l, b',
0x69: 'ld l, c',
0x6a: 'ld l, d',
0x6b: 'ld l, e',
0x6c: 'ld l, h',
0x6d: 'ld l, l',
0x6e: 'ld l, [hl]',
0x6f: 'ld l, a',
0x70: 'ld [hl], b',
0x71: 'ld [hl], c',
0x72: 'ld [hl], d',
0x73: 'ld [hl], e',
0x74: 'ld [hl], h',
0x75: 'ld [hl], l',
0x76: 'halt',
0x77: 'ld [hl], a',
0x78: 'ld a, b',
0x79: 'ld a, c',
0x7a: 'ld a, d',
0x7b: 'ld a, e',
0x7c: 'ld a, h',
0x7d: 'ld a, l',
0x7e: 'ld a, [hl]',
0x7f: 'ld a, a',
0x80: 'add a, b',
0x81: 'add a, c',
0x82: 'add a, d',
0x83: 'add a, e',
0x84: 'add a, h',
0x85: 'add a, l',
0x86: 'add a, [hl]',
0x87: 'add a, a',
0x88: 'adc a, b',
0x89: 'adc a, c',
0x8a: 'adc a, d',
0x8b: 'adc a, e',
0x8c: 'adc a, h',
0x8d: 'adc a, l',
0x8e: 'adc a, [hl]',
0x8f: 'adc a, a',
0x90: 'sub b',
0x91: 'sub c',
0x92: 'sub d',
0x93: 'sub e',
0x94: 'sub h',
0x95: 'sub l',
0x96: 'sub [hl]',
0x97: 'sub a',
0x98: 'sbc a, b',
0x99: 'sbc a, c',
0x9a: 'sbc a, d',
0x9b: 'sbc a, e',
0x9c: 'sbc a, h',
0x9d: 'sbc a, l',
0x9e: 'sbc a, [hl]',
0x9f: 'sbc a, a',
0xa0: 'and b',
0xa1: 'and c',
0xa2: 'and d',
0xa3: 'and e',
0xa4: 'and h',
0xa5: 'and l',
0xa6: 'and [hl]',
0xa7: 'and a',
0xa8: 'xor b',
0xa9: 'xor c',
0xaa: 'xor d',
0xab: 'xor e',
0xac: 'xor h',
0xad: 'xor l',
0xae: 'xor [hl]',
0xaf: 'xor a',
0xb0: 'or b',
0xb1: 'or c',
0xb2: 'or d',
0xb3: 'or e',
0xb4: 'or h',
0xb5: 'or l',
0xb6: 'or [hl]',
0xb7: 'or a',
0xb8: 'cp b',
0xb9: 'cp c',
0xba: 'cp d',
0xbb: 'cp e',
0xbc: 'cp h',
0xbd: 'cp l',
0xbe: 'cp [hl]',
0xbf: 'cp a',
0xc0: 'ret nz',
0xc1: 'pop bc',
0xc2: 'jp nz, #a16',
0xc3: 'jp #a16',
0xc4: 'call nz, #a16',
0xc5: 'push bc',
0xc6: 'add a, #d8',
0xc7: 'rst $00',
0xc8: 'ret z',
0xc9: 'ret',
0xca: 'jp z, #a16',
0xcb: gbz80_bitops,
0xcc: 'call z, #a16',
0xcd: 'call #a16',
0xce: 'adc a, #d8',
0xcf: 'rst $08',
0xd0: 'ret nc',
0xd1: 'pop de',
0xd2: 'jp nc, #a16',
# 0xd3
0xd4: 'call nc, #a16',
0xd5: 'push de',
0xd6: 'sub #d8',
0xd7: 'rst $10',
0xd8: 'ret c',
0xd9: 'reti',
0xda: 'jp c, #a16',
# 0xdb
0xdc: 'call c, #a16',
# 0xdd
0xde: 'sbc a, #d8',
0xdf: 'rst $18',
0xe0: 'ldh [#a8], a',
0xe1: 'pop hl',
0xe2: 'ld [$ff00+c], a', # XXX table claims 2 but this looks like 1 to me
# 0xe3
# 0xe4
0xe5: 'push hl',
0xe6: 'and #d8',
0xe7: 'rst $20',
0xe8: 'add sp, #r8',
0xe9: 'jp [hl]',
0xea: 'ld [#a16], a',
# 0xeb
# 0xec
# 0xed
0xee: 'xor #d8',
0xef: 'rst $28',
0xf0: 'ldh a, [#a8]',
0xf1: 'pop af',
0xf2: 'ld a, [$ff00+c]', # XXX table says 1 but this looks like 1 to me
0xf3: 'di',
# 0xf4
0xf5: 'push af',
0xf6: 'or #d8',
0xf7: 'rst $30',
0xf8: 'ld hl, sp+#r8',
0xf9: 'ld sp, hl',
0xfa: 'ld a, [#a16]',
0xfb: 'ei',
# 0xfc
# 0xfd
0xfe: 'cp #d8',
0xff: 'rst $38',
}
class Atom:
is_constant = False
is_input = False
is_register = False
def is_compatible_with(self, other):
return self == other
def render(self):
raise NotImplementedError
@attr.s
class InputAtom(Atom):
name = attr.ib()
length = attr.ib()
is_input = True
def is_compatible_with(self, other):
# Inputs are compatible with anything that's not a register
# TODO does length matter?
return isinstance(other, (InputAtom, ConstantAtom))
def render(self, value=None):
if value is None:
return '#' + self.name
else:
return "${:0{width}x}".format(value, width=self.length * 2)
@attr.s
class RegisterAtom(Atom):
name = attr.ib()
is_register = True
def render(self):
return self.name
@attr.s
class ConstantAtom(Atom):
value = attr.ib()
is_constant = True
def render(self):
if self.value < 256:
return "${:02x}".format(self.value)
else:
return "${:04x}".format(self.value)
class Instruction:
def __init__(self, syntax, prefix, mnemonic, length, args, inputs):
self.syntax = syntax
self.prefix = prefix
self.mnemonic = mnemonic
self.length = length
# list of (items_to_sum, is_pointer) tuples
self.args = args
self.inputs = inputs
@staticmethod
def partial_parse(syntax):
mnemonic, *argstrs = re.split('[, ]+', syntax)
args = []
for argstr in argstrs:
ptr = False
if argstr.startswith('[') and argstr.endswith(']'):
ptr = True
argstr = argstr[1:-1]
atoms = []
for atomstr in re.split('[+](?=.)', argstr):
if atomstr.startswith('#'):
atom = InputAtom(atomstr[1:], None)
elif atomstr in (
'a', 'f', 'b', 'c', 'd', 'e', 'h', 'l',
'af', 'bc', 'de', 'hl', 'hl+', 'hl-',
'sp', 'pc', 'z', 'nz', 'c', 'nc',
):
atom = RegisterAtom(atomstr)
elif atomstr.startswith('$'):
atom = ConstantAtom(int(atomstr[1:], 16))
elif atomstr.isdigit():
atom = ConstantAtom(int(atomstr))
else:
raise SyntaxError(
"Unrecognized argument {!r} in instruction {!r}"
.format(atomstr, syntax))
atoms.append(atom)
args.append((atoms, ptr))
return mnemonic, args
@classmethod
def parse(cls, syntax, prefix):
mnemonic, args = cls.partial_parse(syntax)
inputs = []
length = len(prefix)
for atoms, is_ptr in args:
for atom in atoms:
if not atom.is_input:
continue
if atom.name in ('d16', 'a16'):
atom.length = 2
elif atom.name in ('d8', 'a8', 'r8'):
atom.length = 1
else:
raise SyntaxError(
"Unrecognized input name {}".format(atom.name))
inputs.append(atom)
length += atom.length
self = cls(syntax, prefix, mnemonic, length, args, inputs)
assert self(ignore_inputs=True) == syntax
return self
def __repr__(self):
return "<{} 0x{}: {}>".format(
type(self).__name__,
self.prefix.hex(),
self(ignore_inputs=True),
)
def __call__(self, *inputs, ignore_inputs=False):
inputs = list(inputs)
if not ignore_inputs:
if len(inputs) != len(self.inputs):
raise TypeError(
"{} needs {} inputs, got {}"
.format(self.syntax, len(self.inputs), len(inputs)))
args = []
for atoms, is_ptr in self.args:
atomstrs = []
for atom in atoms:
if not ignore_inputs and atom.is_input:
atomstrs.append(atom.render(inputs.pop(0)))
else:
atomstrs.append(atom.render())
expr = '+'.join(atomstrs)
if is_ptr:
expr = '[' + expr + ']'
args.append(expr)
out = self.mnemonic
if args:
out = out + ' ' + ', '.join(args)
return out
def match_inputs(self, mnemonic, args):
if self.mnemonic != mnemonic:
return
if len(self.args) != len(args):
return
# Compare args
input_pairs = []
for (atoms1, ptr1), (atoms2, ptr2) in zip(self.args, args):
if ptr1 != ptr2:
return
if len(atoms1) != len(atoms2):
return
# TODO technically, A+B is the same as B+A, but the lists are het
# so i can't sort
# TODO also, constant folding could make A+B the same as C
for atom1, atom2 in zip(atoms1, atoms2):
if not atom1.is_compatible_with(atom2):
return
if atom1.is_input:
input_pairs.append((atom1, atom2))
return input_pairs
class InstructionSet:
def __init__(self, instructions):
self.instructions = {}
self.mnemonics = defaultdict(set)
self._load_instructions(instructions)
def _load_instructions(self, instructions, *, prefix=b''):
for n, syntax in instructions.items():
byte = bytes([n])
if isinstance(syntax, dict):
# Nested args, for the bitops instruction
self._load_instructions(syntax, prefix=byte)
else:
instr = Instruction.parse(syntax, prefix + byte)
self.instructions[prefix + byte] = instr
self.mnemonics[instr.mnemonic].add(instr)
gbz80 = InstructionSet(gbz80_instructions)
needle = """push bc
push hl
ld a, [#wd11e]
dec a
ld hl, #PokedexOrder
ld b, 0
ld c, a
add hl, bc
ld a, [hl]
ld [#wd11e], a
pop hl
pop bc
ret"""
"""
\xc5
\xe5
\xfa (..)
\x3d
\x21 (..)
\x06 \x00
\x4f
\x09
\x7e
\xea \1
\xe1
\xc1
\xc9
"""
haystack = b'\xc5\xe5\xfaXY\x3d\x21ZW\x06\x00\x4f\x09\x7e\xeaXY\xe1\xc1\xc9'
# ------------------------------------------------------------------------------
# Disassemble
def disassemble(haystack):
i = 0
while i < len(haystack):
for l in range(2):
prefix = haystack[i:i+l+1]
if prefix in gbz80.instructions:
instr = gbz80.instructions[prefix]
break
else:
raise SyntaxError
i += len(prefix)
inputs = []
for inp in instr.inputs:
inputs.append(int.from_bytes(
haystack[i:i + inp.length], byteorder='little'))
i += inp.length
print(instr(*inputs))
# ------------------------------------------------------------------------------
# Pattern match
def find_code(haystack, needle, **kwargs):
# TODO error if something in kwargs isn't a pattern input?
# TODO the return value here is goofy
# TODO maybe use finditer and yield instead
pattern_chunks = []
input_table = OrderedDict()
matched_instructions = []
for instruction in needle.splitlines():
instruction = re.sub(';.*', '', instruction).strip()
if not instruction:
continue
mnemonic, args = Instruction.partial_parse(instruction)
candidates = gbz80.mnemonics[mnemonic]
for candidate in candidates:
inputs = candidate.match_inputs(mnemonic, args)
if inputs is not None:
break
else:
raise SyntaxError
instr = candidate
pattern_chunks.append(re.escape(instr.prefix))
pattern_atoms = []
for instr_input, pattern_atom in inputs:
pattern_atoms.append(pattern_atom)
if pattern_atom.is_constant:
pattern_chunks.append(re.escape(
pattern_atom.value.to_bytes(
instr_input.length, byteorder='little')))
elif pattern_atom.name in input_table:
pattern_chunks.append(input_table[pattern_atom.name])
else:
if pattern_atom.name in kwargs:
inner_pattern = re.escape(
kwargs[pattern_atom.name].to_bytes(
instr_input.length, byteorder='little'))
else:
inner_pattern = b'.' * instr_input.length
group_name = pattern_atom.name.encode('ascii')
input_table[pattern_atom.name] = b'(?P=%b)' % (group_name,)
pattern_chunks.append(b'(?P<%b>%b)' % (group_name, inner_pattern))
matched_instructions.append((instr, pattern_atoms))
pattern = b''.join(pattern_chunks)
m = re.search(pattern, haystack, flags=re.DOTALL)
if m:
matched_inputs = {}
for inp in input_table:
matched_inputs[inp] = int.from_bytes(
m.group(inp), byteorder='little')
for instr, pattern_atoms in matched_instructions:
inputs = []
for atom in pattern_atoms:
if atom.is_constant:
inputs.append(atom.value)
else:
inputs.append(matched_inputs[atom.name])
return m, matched_inputs
else:
return

View file

@ -14,13 +14,13 @@ import hashlib
import io import io
import logging import logging
from pathlib import Path from pathlib import Path
import re
import sys import sys
from camel import Camel from camel import Camel
from classtools import reify from classtools import reify
from construct import * from construct import *
from pokedex.extract.lib.gbz80 import find_code
import pokedex.schema as schema import pokedex.schema as schema
# TODO set this up to colorcode and use {} formatting # TODO set this up to colorcode and use {} formatting
@ -1167,8 +1167,6 @@ pokemon_struct = Struct(
Padding(1), Padding(1),
) )
pokemon_name_struct = PokemonCString('pokemon_name', 10)
evos_moves_struct = Struct( evos_moves_struct = Struct(
'evos_moves', 'evos_moves',
@ -1281,28 +1279,17 @@ class RBYCart:
Return a dict of raw file offsets. The keys are the names used in the Return a dict of raw file offsets. The keys are the names used in the
pokered project. pokered project.
""" """
# The base stats are always in the same place in RBY, and only slightly # The ideal approach is to find some assembly code that appears just
# off in RG. Not sure why! But it hopefully means recompilation # before the data of interest. It's pretty hacky, but since
# doesn't affect them. # translators (and even modders) would have little reason to rearrange
addresses = { # functions or inject new ones in these odd places, it ought to work
# These ones have, thusfar, defied automatic detection, as they're # well enough. And it's better than ferreting out and hard-coding
# just part of a big old block of data — so I can't just look for # piles of addresses.
# code nearby.
# TODO these are for rby; fix for rg, and maybe y?
'BaseStats': unbank('0E:43DE'),
'MewBaseStats': unbank('01:425B'),
}
# For everything else, the general approach is to find some assembly
# code that appears just before the data of interest. It's pretty
# hacky, but since translators (and even modders) would have little
# reason to rearrange functions or inject new ones in these odd places,
# it ought to work well enough. And it's better than ferreting out and
# hard-coding piles of addresses.
# The only hard part is that assembly code that contains an address # The only hard part is that assembly code that contains an address
# won't work, since that address will also vary per game. # won't work, since that address will also vary per game.
# Each of the landmarks used here appears in every official cartridge # Each of the landmarks used here appears in every official cartridge
# exactly once. # exactly once.
addresses = {}
# This is an entire function used by the Pokédex and which immediately # This is an entire function used by the Pokédex and which immediately
# precedes all the flavor text. # precedes all the flavor text.
@ -1322,6 +1309,72 @@ class RBYCart:
raise CartDetectionError("Can't find evolution and moveset table") raise CartDetectionError("Can't find evolution and moveset table")
addresses['EvosMovesPointerTable'] = idx + len(asm_WriteMonMoves_ShiftMoveData) + 5 addresses['EvosMovesPointerTable'] = idx + len(asm_WriteMonMoves_ShiftMoveData) + 5
# Several lists of names are accessed by a single function, which looks
# through a list of pointers to find the right set of names to use.
# That's great news for me: I can just grab all of those delicious
# pointers at once. Here's an excerpt from GetName.
match = find_code(self.data, '''
inc d
;.skip
ld hl, #NamePointers
add hl,de
ld a,[hl+]
ldh [$96],a
ld a,[hl]
ldh [$95],a
ldh a,[$95]
ld h,a
ldh a,[$96]
ld l,a
ld a,[#wd0b5]
ld b,a
ld c,0
;.nextName
ld d,h
ld e,l
;.nextChar
ld a,[hl+]
cp $50 ; terminator @, encoded
''')
if not match:
raise CartDetectionError("Can't find name array")
rem, inputs = match
start = inputs['NamePointers']
name_pointers = Array(7, ULInt16('dummy')).parse(
self.data[start:start + 14])
# One downside to the Game Boy memory structure is that banks are
# not stored anywhere near their corresponding addresses. Most
# bank numbers are hardcoded here, but Pokémon names are in a different
# bank in Japanese games, so we've gotta scrape the bank too...
match = find_code(self.data, '''
;GetMonName::
push hl
ldh a,[#H_LOADEDROMBANK]
push af
ld a,#BANK_MonsterNames
ldh [#H_LOADEDROMBANK],a
ld [#MBC1RomBank],a
ld a,[#wd11e]
dec a
ld hl,#MonsterNames
''',
H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF
MBC1RomBank=0x2000,
MonsterNames=name_pointers[0]
)
if not match:
raise CartDetectionError("Can't find Pokémon names")
rem, inputs = match
addresses['MonsterNames'] = unbank(
inputs['BANK_MonsterNames'], name_pointers[0])
addresses['MoveNames'] = unbank(0x2C, name_pointers[1])
# 2: UnusedNames (unused, obviously)
addresses['ItemNames'] = unbank(0x01, name_pointers[3])
# 4: wPartyMonOT (only useful while the game is running)
# 5: wEnemyMonOT (only useful while the game is running)
addresses['TrainerNames'] = unbank(0x0E, name_pointers[6])
# Finding TMs is a bit harder. They come right after a function for # Finding TMs is a bit harder. They come right after a function for
# looking up a TM number, which is very short and very full of # looking up a TM number, which is very short and very full of
# addresses. So here's a regex. # addresses. So here's a regex.
@ -1330,26 +1383,29 @@ class RBYCart:
# In English it is, unsurprisingly, 0xD11E. # In English it is, unsurprisingly, 0xD11E.
# `TechnicalMachines` is the address we're looking for, which should # `TechnicalMachines` is the address we're looking for, which should
# immediately follow what this matches. # immediately follow what this matches.
asm_TMToMove_rx = re.compile(rb''' match = find_code(self.data, '''
\xfa (..) # ld a, [wd11e] ld a, [#wd11e]
\x3d # dec a dec a
\x21 (..) # ld hl, TechnicalMachines ld hl, #TechnicalMachines
\x06 \x00 # ld b, $0 ld b, $0
\x4f # ld c, a ld c, a
\x09 # add hl, bc add hl, bc
\x7e # ld a, [hl] ld a, [hl]
\xea \1 # ld [wd11e], a ld [#wd11e], a
\xc9 # ret ret
''', flags=re.DOTALL | re.VERBOSE) ''')
for match in asm_TMToMove_rx.finditer(self.data): if match:
matched_addr = ULInt16('...').parse(match.group(2)) rem, inputs = match
tentative_addr = match.end() # TODO this should mayybe also check that the address immediately follows this code
matched_addr = inputs['TechnicalMachines']
tentative_addr = rem.end()
# Remember, addresses don't include the bank! # Remember, addresses don't include the bank!
_, banked_addr = bank(tentative_addr) _, banked_addr = bank(tentative_addr)
if matched_addr == banked_addr: if matched_addr == banked_addr:
asm_wd11e_addr = match.group(1) asm_wd11e_addr = inputs['wd11e']
addresses['TechnicalMachines'] = tentative_addr addresses['TechnicalMachines'] = tentative_addr
break else:
raise RuntimeError
# TODO should there really be more than one match? # TODO should there really be more than one match?
else: else:
raise CartDetectionError("Can't find technical machines list") raise CartDetectionError("Can't find technical machines list")
@ -1359,61 +1415,107 @@ class RBYCart:
# These are almost immediately after the Pokédex entries themselves, # These are almost immediately after the Pokédex entries themselves,
# but this actually seems easier than figuring out where a table of # but this actually seems easier than figuring out where a table of
# pointers ends. # pointers ends.
asm_IndexToPokedex_rx = re.compile(rb''' match = find_code(self.data, '''
\xc5 # push bc push bc
\xe5 # push hl push hl
\xfa (..) # ld a,[wd11e] ld a, [#wd11e]
\x3d # dec a dec a
\x21 (..) # ld hl,PokedexOrder ld hl, #PokedexOrder
\x06 \x00 # ld b,0 ld b, 0
\x4f # ld c,a ld c, a
\x09 # add hl,bc add hl, bc
\x7e # ld a,[hl] ld a, [hl]
\xea \1 # ld [wd11e],a ld [#wd11e], a
\xe1 # pop hl pop hl
\xc1 # pop bc pop bc
\xc9 # ret ret
''', flags=re.DOTALL | re.VERBOSE) ''', wd11e=asm_wd11e_addr)
for match in asm_IndexToPokedex_rx.finditer(self.data): if match:
matched_addr = ULInt16('...').parse(match.group(2)) rem, inputs = match
tentative_addr = match.end() matched_addr = inputs['PokedexOrder']
tentative_addr = rem.end()
# Remember, addresses don't include the bank! # Remember, addresses don't include the bank!
_, banked_addr = bank(tentative_addr) _, banked_addr = bank(tentative_addr)
if matched_addr == banked_addr and asm_wd11e_addr == match.group(1): if matched_addr == banked_addr:
addresses['PokedexOrder'] = tentative_addr addresses['PokedexOrder'] = tentative_addr
break else:
raise RuntimeError
else: else:
raise CartDetectionError("Can't find Pokédex order") raise CartDetectionError("Can't find Pokédex order")
# This is assembly code that appears near the end of a function called # Ah, but then, we have base stats. These don't have code nearby;
# WaitForSoundToFinish. # they're just stuck immediately after moves. Except in R/G, where
end_of_WaitForSoundToFinish = bytes.fromhex('afb6 23b6 2323 b6') # they appear /before/ moves! And we don't know what version we're
try: # running yet, because the addresses detected in this method are used
idx = self.data.index(end_of_WaitForSoundToFinish) # for language detection. Hmm.
except ValueError: # Here's plan B: look for the function that /loads/ base stats, and
raise CartDetectionError("Can't find name array") # scrape the address out of it. This function is a bit hairy; I've had
# There are a couple more bytes in the function, but they involve an # to expand some of pokered's macros and rewrite the jumps to something
# address so they can't be searched for. Red/Green/Blue have four; # that the rudimentary code matcher can understand.
# Yellow has an extra 'and', which is annoying, but at least easy to match = find_code(self.data, '''
# handle. ldh a, [#H_LOADEDROMBANK]
start = idx + len(end_of_WaitForSoundToFinish) push af
if self.data[start] == 0xA7: ld a, #BANK_BaseStats
# Yellow; skip one more byte ldh [#H_LOADEDROMBANK], a
start += 1 ld [#MBC1RomBank], a
start += 4 push bc
push de
name_pointers = Array(7, ULInt16('dummy')).parse(self.data[start:start + 14]) push hl
# One downside to the Game Boy memory structure is that banks are not ld a, [#wd11e]
# stored anywhere near their corresponding addresses, so the bank push af
# numbers are hardcoded here. They're fairly unlikely to change ld a,[#wd0b5]
# between games. Right? Probably? ld [#wd11e],a
addresses['MonsterNames'] = unbank(0x07, name_pointers[0]) ld de,#FossilKabutopsPic
addresses['MoveNames'] = unbank(0x2C, name_pointers[1]) ld b,$66 ; size of Kabutops fossil and Ghost sprites
# 2: UnusedNames (unused, obviously) cp #FOSSIL_KABUTOPS ; Kabutops fossil
addresses['ItemNames'] = unbank(0x01, name_pointers[3]) jr z,#specialID1
# 4: wPartyMonOT (only useful while the game is running) ld de,#GhostPic
# 5: wEnemyMonOT (only useful while the game is running) cp #MON_GHOST ; Ghost
addresses['TrainerNames'] = unbank(0x0E, name_pointers[6]) jr z,#specialID2
ld de,#FossilAerodactylPic
ld b,$77 ; size of Aerodactyl fossil sprite
cp #FOSSIL_AERODACTYL ; Aerodactyl fossil
jr z,#specialID3
cp #MEW
jr z,#mew
ld a, #IndexToPokedexPredef
call #IndexToPokedex ; convert pokemon ID in [wd11e] to pokedex number
ld a,[#wd11e]
dec a
ld bc, #MonBaseStatsLength
ld hl, #BaseStats
call #AddNTimes
ld de, #wMonHeader
ld bc, #MonBaseStatsLength
call #CopyData
jr #done1
;.specialID
ld hl, #wMonHSpriteDim
ld [hl], b ; write sprite dimensions
inc hl
ld [hl], e ; write front sprite pointer
inc hl
ld [hl], d
jr #done2
;.mew
ld hl, #MewBaseStats
ld de, #wMonHeader
ld bc, #MonBaseStatsLength
ld a, #BANK_MewBaseStats
call #FarCopyData
''',
# These are constants; I left them in the above code for clarity
H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF
MBC1RomBank=0x2000,
# This was scraped previously
wd11e=asm_wd11e_addr,
)
if match:
rem, inputs = match
addresses['BaseStats'] = unbank(inputs['BANK_BaseStats'], inputs['BaseStats'])
addresses['MewBaseStats'] = unbank(inputs['BANK_MewBaseStats'], inputs['MewBaseStats'])
else:
raise CartDetectionError("Can't find base stats")
return addresses return addresses
@ -1612,7 +1714,12 @@ class RBYCart:
ret = [None] * self.NUM_POKEMON ret = [None] * self.NUM_POKEMON
self.stream.seek(self.addrs['MonsterNames']) self.stream.seek(self.addrs['MonsterNames'])
for index, pokemon_name in enumerate(Array(self.max_pokemon_index, pokemon_name_struct).parse_stream(self.stream), start=1): # TODO i don't like this, but they don't have explicit terminators...
if self.language == 'ja':
name_length = 5
else:
name_length = 10
for index, pokemon_name in enumerate(Array(self.max_pokemon_index, PokemonCString('...', name_length)).parse_stream(self.stream), start=1):
try: try:
id = self.pokedex_order[index] id = self.pokedex_order[index]
except KeyError: except KeyError:
@ -1631,7 +1738,6 @@ class RBYCart:
def pokemon_records(self): def pokemon_records(self):
"""List of pokemon_structs.""" """List of pokemon_structs."""
self.stream.seek(self.addrs['BaseStats']) self.stream.seek(self.addrs['BaseStats'])
print(self.stream.read(100).hex())
records = Array(self.NUM_POKEMON - 1, pokemon_struct).parse_stream(self.stream) records = Array(self.NUM_POKEMON - 1, pokemon_struct).parse_stream(self.stream)
# Mew's data is, awkwardly, stored separately # Mew's data is, awkwardly, stored separately
self.stream.seek(self.addrs['MewBaseStats']) self.stream.seek(self.addrs['MewBaseStats'])