mirror of
https://github.com/veekun/pokedex.git
synced 2024-08-20 18:16:34 +00:00
Write a gbz80 pattern matcher, for more robust address sniffing
With this, Pokémon names are finally detected correctly from both R/G and R/B. Hurrah. Yellow doesn't work yet, though. Sigh.
This commit is contained in:
parent
969d671c48
commit
122da8d885
2 changed files with 841 additions and 88 deletions
647
pokedex/extract/lib/gbz80.py
Normal file
647
pokedex/extract/lib/gbz80.py
Normal file
|
@ -0,0 +1,647 @@
|
|||
"""Stuff for dealing with the Game Boy's Z80-ish machine code. Most notably,
|
||||
can do pattern-matching against a chunk of assembly with missing values.
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
from collections import defaultdict
|
||||
import re
|
||||
|
||||
import attr
|
||||
|
||||
# TODO: would be nice to understand "cp a, #foo" and similar for xor/or/and/etc
|
||||
# TODO: would be AMAZING to understand labels when searching for code wow
|
||||
|
||||
# This table is courtesy of pokemon-reverse-engineering-tools:
|
||||
# https://github.com/pret/pokemon-reverse-engineering-tools/blob/master/pokemontools/gbz80disasm.py
|
||||
# Second-byte lookup for the $cb-prefixed instructions: maps the byte following
# $cb (0x00 - 0xff) to its assembly syntax.  The table is completely regular,
# so it is generated rather than spelled out:
#   $00 - $3f: eight rotate/shift/swap operations, one per register operand
#   $40 - $ff: bit / res / set, each for bit numbers $00 - $07, one per
#              register operand
gbz80_bitops = dict(enumerate(
    [
        "{} {}".format(operation, operand)
        for operation in (
            "rlc", "rrc", "rl", "rr", "sla", "sra", "swap", "srl",
        )
        for operand in ("b", "c", "d", "e", "h", "l", "[hl]", "a")
    ] + [
        "{} ${:02x}, {}".format(operation, bit_number, operand)
        for operation in ("bit", "res", "set")
        for bit_number in range(8)
        for operand in ("b", "c", "d", "e", "h", "l", "[hl]", "a")
    ]
))
|
||||
|
||||
# This instruction list was carefully scraped from:
|
||||
# http://www.pastraiser.com/cpu/gameboy/gameboy_opcodes.html
|
||||
# Maps each one-byte opcode to its assembly syntax.  Operands written as
# `#name` are placeholders for bytes that follow the opcode; Instruction.parse
# treats #d16/#a16 as two bytes and #d8/#a8/#r8 as one byte.  The entry for
# 0xcb is not a string but the nested gbz80_bitops table, keyed by the byte
# that follows 0xcb.  Opcodes that appear only as comments have no entry.
gbz80_instructions = {
    0x00: 'nop',
    0x01: 'ld bc, #d16',
    0x02: 'ld [bc], a',
    0x03: 'inc bc',
    0x04: 'inc b',
    0x05: 'dec b',
    0x06: 'ld b, #d8',
    0x07: 'rlca',
    0x08: 'ld [#a16], sp',
    0x09: 'add hl, bc',
    0x0a: 'ld a, [bc]',
    0x0b: 'dec bc',
    0x0c: 'inc c',
    0x0d: 'dec c',
    0x0e: 'ld c, #d8',
    0x0f: 'rrca',

    0x10: 'stop',
    0x11: 'ld de, #d16',
    0x12: 'ld [de], a',
    0x13: 'inc de',
    0x14: 'inc d',
    0x15: 'dec d',
    0x16: 'ld d, #d8',
    0x17: 'rla',
    0x18: 'jr #r8',
    0x19: 'add hl, de',
    0x1a: 'ld a, [de]',
    0x1b: 'dec de',
    0x1c: 'inc e',
    0x1d: 'dec e',
    0x1e: 'ld e, #d8',
    0x1f: 'rra',

    0x20: 'jr nz, #r8',
    0x21: 'ld hl, #d16',
    0x22: 'ld [hl+], a',
    0x23: 'inc hl',
    0x24: 'inc h',
    0x25: 'dec h',
    0x26: 'ld h, #d8',
    0x27: 'daa',
    0x28: 'jr z, #r8',
    0x29: 'add hl, hl',
    0x2a: 'ld a, [hl+]',
    0x2b: 'dec hl',
    0x2c: 'inc l',
    0x2d: 'dec l',
    0x2e: 'ld l, #d8',
    0x2f: 'cpl',

    0x30: 'jr nc, #r8',
    0x31: 'ld sp, #d16',
    0x32: 'ld [hl-], a',
    0x33: 'inc sp',
    0x34: 'inc [hl]',
    0x35: 'dec [hl]',
    0x36: 'ld [hl], #d8',
    0x37: 'scf',
    0x38: 'jr c, #r8',
    0x39: 'add hl, sp',
    0x3a: 'ld a, [hl-]',
    0x3b: 'dec sp',
    0x3c: 'inc a',
    0x3d: 'dec a',
    0x3e: 'ld a, #d8',
    0x3f: 'ccf',

    0x40: 'ld b, b',
    0x41: 'ld b, c',
    0x42: 'ld b, d',
    0x43: 'ld b, e',
    0x44: 'ld b, h',
    0x45: 'ld b, l',
    0x46: 'ld b, [hl]',
    0x47: 'ld b, a',
    0x48: 'ld c, b',
    0x49: 'ld c, c',
    0x4a: 'ld c, d',
    0x4b: 'ld c, e',
    0x4c: 'ld c, h',
    0x4d: 'ld c, l',
    0x4e: 'ld c, [hl]',
    0x4f: 'ld c, a',

    0x50: 'ld d, b',
    0x51: 'ld d, c',
    0x52: 'ld d, d',
    0x53: 'ld d, e',
    0x54: 'ld d, h',
    0x55: 'ld d, l',
    0x56: 'ld d, [hl]',
    0x57: 'ld d, a',
    0x58: 'ld e, b',
    0x59: 'ld e, c',
    0x5a: 'ld e, d',
    0x5b: 'ld e, e',
    0x5c: 'ld e, h',
    0x5d: 'ld e, l',
    0x5e: 'ld e, [hl]',
    0x5f: 'ld e, a',

    0x60: 'ld h, b',
    0x61: 'ld h, c',
    0x62: 'ld h, d',
    0x63: 'ld h, e',
    0x64: 'ld h, h',
    0x65: 'ld h, l',
    0x66: 'ld h, [hl]',
    0x67: 'ld h, a',
    0x68: 'ld l, b',
    0x69: 'ld l, c',
    0x6a: 'ld l, d',
    0x6b: 'ld l, e',
    0x6c: 'ld l, h',
    0x6d: 'ld l, l',
    0x6e: 'ld l, [hl]',
    0x6f: 'ld l, a',

    0x70: 'ld [hl], b',
    0x71: 'ld [hl], c',
    0x72: 'ld [hl], d',
    0x73: 'ld [hl], e',
    0x74: 'ld [hl], h',
    0x75: 'ld [hl], l',
    0x76: 'halt',
    0x77: 'ld [hl], a',
    0x78: 'ld a, b',
    0x79: 'ld a, c',
    0x7a: 'ld a, d',
    0x7b: 'ld a, e',
    0x7c: 'ld a, h',
    0x7d: 'ld a, l',
    0x7e: 'ld a, [hl]',
    0x7f: 'ld a, a',

    0x80: 'add a, b',
    0x81: 'add a, c',
    0x82: 'add a, d',
    0x83: 'add a, e',
    0x84: 'add a, h',
    0x85: 'add a, l',
    0x86: 'add a, [hl]',
    0x87: 'add a, a',
    0x88: 'adc a, b',
    0x89: 'adc a, c',
    0x8a: 'adc a, d',
    0x8b: 'adc a, e',
    0x8c: 'adc a, h',
    0x8d: 'adc a, l',
    0x8e: 'adc a, [hl]',
    0x8f: 'adc a, a',

    0x90: 'sub b',
    0x91: 'sub c',
    0x92: 'sub d',
    0x93: 'sub e',
    0x94: 'sub h',
    0x95: 'sub l',
    0x96: 'sub [hl]',
    0x97: 'sub a',
    0x98: 'sbc a, b',
    0x99: 'sbc a, c',
    0x9a: 'sbc a, d',
    0x9b: 'sbc a, e',
    0x9c: 'sbc a, h',
    0x9d: 'sbc a, l',
    0x9e: 'sbc a, [hl]',
    0x9f: 'sbc a, a',

    0xa0: 'and b',
    0xa1: 'and c',
    0xa2: 'and d',
    0xa3: 'and e',
    0xa4: 'and h',
    0xa5: 'and l',
    0xa6: 'and [hl]',
    0xa7: 'and a',
    0xa8: 'xor b',
    0xa9: 'xor c',
    0xaa: 'xor d',
    0xab: 'xor e',
    0xac: 'xor h',
    0xad: 'xor l',
    0xae: 'xor [hl]',
    0xaf: 'xor a',

    0xb0: 'or b',
    0xb1: 'or c',
    0xb2: 'or d',
    0xb3: 'or e',
    0xb4: 'or h',
    0xb5: 'or l',
    0xb6: 'or [hl]',
    0xb7: 'or a',
    0xb8: 'cp b',
    0xb9: 'cp c',
    0xba: 'cp d',
    0xbb: 'cp e',
    0xbc: 'cp h',
    0xbd: 'cp l',
    0xbe: 'cp [hl]',
    0xbf: 'cp a',

    0xc0: 'ret nz',
    0xc1: 'pop bc',
    0xc2: 'jp nz, #a16',
    0xc3: 'jp #a16',
    0xc4: 'call nz, #a16',
    0xc5: 'push bc',
    0xc6: 'add a, #d8',
    0xc7: 'rst $00',
    0xc8: 'ret z',
    0xc9: 'ret',
    0xca: 'jp z, #a16',
    # Prefix byte: the actual instruction is selected by the following byte
    0xcb: gbz80_bitops,
    0xcc: 'call z, #a16',
    0xcd: 'call #a16',
    0xce: 'adc a, #d8',
    0xcf: 'rst $08',

    0xd0: 'ret nc',
    0xd1: 'pop de',
    0xd2: 'jp nc, #a16',
    # 0xd3
    0xd4: 'call nc, #a16',
    0xd5: 'push de',
    0xd6: 'sub #d8',
    0xd7: 'rst $10',
    0xd8: 'ret c',
    0xd9: 'reti',
    0xda: 'jp c, #a16',
    # 0xdb
    0xdc: 'call c, #a16',
    # 0xdd
    0xde: 'sbc a, #d8',
    0xdf: 'rst $18',

    0xe0: 'ldh [#a8], a',
    0xe1: 'pop hl',
    0xe2: 'ld [$ff00+c], a',  # XXX table claims 2 but this looks like 1 to me
    # 0xe3
    # 0xe4
    0xe5: 'push hl',
    0xe6: 'and #d8',
    0xe7: 'rst $20',
    0xe8: 'add sp, #r8',
    0xe9: 'jp [hl]',
    0xea: 'ld [#a16], a',
    # 0xeb
    # 0xec
    # 0xed
    0xee: 'xor #d8',
    0xef: 'rst $28',

    0xf0: 'ldh a, [#a8]',
    0xf1: 'pop af',
    # NOTE(review): the original remark here read "table says 1 but this
    # looks like 1 to me", which contradicts itself; presumably the table
    # claims 2, as for 0xe2 above -- confirm against the opcode table.
    0xf2: 'ld a, [$ff00+c]',  # XXX table presumably says 2, but this looks like 1 to me
    0xf3: 'di',
    # 0xf4
    0xf5: 'push af',
    0xf6: 'or #d8',
    0xf7: 'rst $30',
    0xf8: 'ld hl, sp+#r8',
    0xf9: 'ld sp, hl',
    0xfa: 'ld a, [#a16]',
    0xfb: 'ei',
    # 0xfc
    # 0xfd
    0xfe: 'cp #d8',
    0xff: 'rst $38',
}
|
||||
|
||||
|
||||
class Atom:
    """Base class for one term of an instruction argument.

    Subclasses flip exactly one of the ``is_*`` flags to announce what kind of
    term they are, and implement ``render`` to produce assembly text.
    """

    # Kind flags; every one is off on the base class.
    is_register = False
    is_input = False
    is_constant = False

    def render(self):
        """Return this atom as assembly text.  Subclasses must override."""
        raise NotImplementedError()

    def is_compatible_with(self, other):
        """Return whether ``other`` may stand in for this atom when matching.

        The default rule is plain equality.
        """
        return self == other
|
||||
|
||||
|
||||
@attr.s
class InputAtom(Atom):
    """A named placeholder (written ``#name``) for bytes encoded in the code.

    ``length`` is the placeholder's size in bytes; it is used to pick the
    number of hex digits when rendering a concrete value.
    """
    name = attr.ib()
    length = attr.ib()
    is_input = True

    def is_compatible_with(self, other):
        """Return whether ``other`` is another input or a constant."""
        # Inputs are compatible with anything that's not a register
        # TODO does length matter?
        accepted_kinds = (InputAtom, ConstantAtom)
        return isinstance(other, accepted_kinds)

    def render(self, value=None):
        """Render as ``#name``, or as zero-padded ``$hex`` if given a value."""
        if value is not None:
            # Two hex digits per byte of input
            return "${:0{}x}".format(value, self.length * 2)
        return '#' + self.name
|
||||
|
||||
|
||||
@attr.s
class RegisterAtom(Atom):
    """A fixed name appearing in an argument; rendered exactly as written."""
    name = attr.ib()
    is_register = True

    def render(self):
        """Return the stored name unchanged."""
        text = self.name
        return text
|
||||
|
||||
|
||||
@attr.s
class ConstantAtom(Atom):
    """A literal number appearing in an argument."""
    value = attr.ib()
    is_constant = True

    def render(self):
        """Render as ``$hex``: two digits below 256, otherwise four."""
        digits = 2 if self.value < 256 else 4
        return "${:0{}x}".format(self.value, digits)
|
||||
|
||||
|
||||
class Instruction:
    """One instruction form: an opcode prefix plus its assembly syntax.

    Calling an instance renders it back to assembly text, optionally
    substituting concrete values for its inputs.
    """

    def __init__(self, syntax, prefix, mnemonic, length, args, inputs):
        self.syntax = syntax
        self.mnemonic = mnemonic
        # Opcode bytes, and total encoded length including input bytes
        self.prefix = prefix
        self.length = length
        # list of (items_to_sum, is_pointer) tuples
        self.args = args
        # The input atoms found in args, in order of appearance
        self.inputs = inputs

    @staticmethod
    def partial_parse(syntax):
        """Split assembly text into ``(mnemonic, args)``.

        Each arg is an ``(atoms, is_pointer)`` tuple: ``is_pointer`` is true
        when the argument was wrapped in square brackets, and ``atoms`` are
        the terms joined by ``+`` inside it.  Input atoms come back with a
        length of None.  Raises SyntaxError for an unrecognized term.
        """
        known_names = frozenset((
            'a', 'f', 'b', 'c', 'd', 'e', 'h', 'l',
            'af', 'bc', 'de', 'hl', 'hl+', 'hl-',
            'sp', 'pc', 'z', 'nz', 'c', 'nc',
        ))

        mnemonic, *operands = re.split('[, ]+', syntax)

        parsed = []
        for operand in operands:
            is_pointer = operand.startswith('[') and operand.endswith(']')
            if is_pointer:
                operand = operand[1:-1]

            terms = []
            # The lookahead keeps a trailing '+' (as in "hl+") attached
            for term in re.split('[+](?=.)', operand):
                if term.startswith('#'):
                    parsed_term = InputAtom(term[1:], None)
                elif term in known_names:
                    parsed_term = RegisterAtom(term)
                elif term.startswith('$'):
                    parsed_term = ConstantAtom(int(term[1:], 16))
                elif term.isdigit():
                    parsed_term = ConstantAtom(int(term))
                else:
                    raise SyntaxError(
                        "Unrecognized argument {!r} in instruction {!r}"
                        .format(term, syntax))
                terms.append(parsed_term)

            parsed.append((terms, is_pointer))

        return mnemonic, parsed

    @classmethod
    def parse(cls, syntax, prefix):
        """Build an Instruction from its syntax and opcode bytes.

        Fills in the byte length of every input atom from its name and
        computes the instruction's total length.  Raises SyntaxError for an
        input name that is not one of d16, a16, d8, a8, r8.
        """
        mnemonic, args = cls.partial_parse(syntax)
        input_widths = {'d16': 2, 'a16': 2, 'd8': 1, 'a8': 1, 'r8': 1}

        total_length = len(prefix)
        found_inputs = []
        for atoms, _is_ptr in args:
            for atom in atoms:
                if not atom.is_input:
                    continue
                if atom.name not in input_widths:
                    raise SyntaxError(
                        "Unrecognized input name {}".format(atom.name))
                atom.length = input_widths[atom.name]
                found_inputs.append(atom)
                total_length += atom.length

        instruction = cls(
            syntax, prefix, mnemonic, total_length, args, found_inputs)
        # Sanity check: rendering must reproduce the syntax we were given
        assert instruction(ignore_inputs=True) == syntax
        return instruction

    def __repr__(self):
        class_name = type(self).__name__
        opcode_hex = self.prefix.hex()
        rendered = self(ignore_inputs=True)
        return "<{} 0x{}: {}>".format(class_name, opcode_hex, rendered)

    def __call__(self, *inputs, ignore_inputs=False):
        """Render this instruction as assembly text.

        Positional values are substituted for the input atoms in order.  With
        ``ignore_inputs`` set, inputs are rendered as ``#name`` placeholders
        and any values passed are disregarded.  Raises TypeError when the
        number of values doesn't match the number of inputs.
        """
        substitute = not ignore_inputs
        if substitute and len(inputs) != len(self.inputs):
            raise TypeError(
                "{} needs {} inputs, got {}"
                .format(self.syntax, len(self.inputs), len(inputs)))

        remaining = list(inputs)
        rendered_args = []
        for atoms, is_ptr in self.args:
            pieces = []
            for atom in atoms:
                if substitute and atom.is_input:
                    pieces.append(atom.render(remaining.pop(0)))
                else:
                    pieces.append(atom.render())
            text = '+'.join(pieces)
            rendered_args.append('[' + text + ']' if is_ptr else text)

        if not rendered_args:
            return self.mnemonic
        return self.mnemonic + ' ' + ', '.join(rendered_args)

    def match_inputs(self, mnemonic, args):
        """Compare this instruction against a partially parsed one.

        Returns None when they don't match.  Otherwise returns a list of
        ``(own_input_atom, other_atom)`` pairs, one for each of this
        instruction's inputs (possibly an empty list).
        """
        if self.mnemonic != mnemonic or len(self.args) != len(args):
            return None

        # Compare args
        pairs = []
        for (own_atoms, own_ptr), (their_atoms, their_ptr) in zip(
                self.args, args):
            if own_ptr != their_ptr or len(own_atoms) != len(their_atoms):
                return None
            # TODO technically, A+B is the same as B+A, but the lists are het
            # so i can't sort
            # TODO also, constant folding could make A+B the same as C
            for own, theirs in zip(own_atoms, their_atoms):
                if not own.is_compatible_with(theirs):
                    return None
                if own.is_input:
                    pairs.append((own, theirs))

        return pairs
|
||||
|
||||
|
||||
class InstructionSet:
    """A collection of Instruction objects built from an opcode table.

    ``instructions`` maps the full opcode bytes to the Instruction, and
    ``mnemonics`` maps a mnemonic to the set of Instructions that use it.
    """

    def __init__(self, instructions):
        """Load a table mapping opcode bytes (ints) to syntax strings.

        A value may itself be a dict, in which case its key is a prefix byte
        and the dict describes the bytes that can follow it.
        """
        self.instructions = {}
        self.mnemonics = defaultdict(set)
        self._load_instructions(instructions)

    def _load_instructions(self, instructions, *, prefix=b''):
        """Register every entry of ``instructions`` under ``prefix``."""
        for n, syntax in instructions.items():
            opcode = prefix + bytes([n])
            if isinstance(syntax, dict):
                # Nested args, for the bitops instruction.  Pass the whole
                # accumulated prefix down, not just this byte, so tables
                # nested more than one level deep keep their leading bytes.
                self._load_instructions(syntax, prefix=opcode)
            else:
                instr = Instruction.parse(syntax, opcode)
                self.instructions[opcode] = instr
                self.mnemonics[instr.mnemonic].add(instr)
|
||||
|
||||
|
||||
# The instruction set that disassemble() and find_code() below consult.
gbz80 = InstructionSet(gbz80_instructions)

# Sample pattern in the syntax find_code() accepts; `#name` marks a named
# input.  Not referenced by any code visible in this module -- presumably
# left over from development; TODO confirm before removing.
needle = """push bc
push hl
ld a, [#wd11e]
dec a
ld hl, #PokedexOrder
ld b, 0
ld c, a
add hl, bc
ld a, [hl]
ld [#wd11e], a
pop hl
pop bc
ret"""
# Bare string expression with no runtime effect.  It lists, instruction by
# instruction, the opcode bytes of the needle above, written like a regex
# ("(..)" for a two-byte input, "\1" for the repeated one).
"""
\xc5
\xe5
\xfa (..)
\x3d
\x21 (..)
\x06 \x00
\x4f
\x09
\x7e
\xea \1
\xe1
\xc1
\xc9
"""
# Sample machine code for the needle above, with b'XY' standing in for the
# #wd11e input and b'ZW' for #PokedexOrder.  Also not referenced by any code
# visible in this module.
haystack = b'\xc5\xe5\xfaXY\x3d\x21ZW\x06\x00\x4f\x09\x7e\xeaXY\xe1\xc1\xc9'
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Disassemble
|
||||
|
||||
def disassemble(haystack):
    """Print ``haystack`` (a bytes object) as assembly, one line per
    instruction.

    Decoding starts at offset 0 and proceeds instruction by instruction.
    Input values are read as little-endian integers.  If the data ends in the
    middle of an input, the value is decoded from whatever bytes remain.

    Raises SyntaxError, naming the offset and the offending bytes, when
    neither the next byte nor the next two bytes form a known opcode.
    """
    i = 0
    while i < len(haystack):
        # Opcodes are one byte long, or two for the prefixed bitops
        for prefix_length in (1, 2):
            prefix = haystack[i:i + prefix_length]
            if prefix in gbz80.instructions:
                instr = gbz80.instructions[prefix]
                break
        else:
            raise SyntaxError(
                "No known instruction at offset {}: 0x{}"
                .format(i, bytes(haystack[i:i + 2]).hex()))

        i += len(prefix)
        inputs = []
        for inp in instr.inputs:
            inputs.append(int.from_bytes(
                haystack[i:i + inp.length], byteorder='little'))
            i += inp.length

        print(instr(*inputs))
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Pattern match
|
||||
|
||||
def find_code(haystack, needle, **kwargs):
    """Search ``haystack`` for machine code matching the assembly ``needle``.

    ``needle`` is assembly text with one instruction per line.  Anything from
    a ``;`` to the end of the line is discarded, and blank lines are skipped.
    An operand written as ``#name`` is a named input: it matches any bytes of
    the right length, and every later use of the same name must match the
    same bytes.  A keyword argument with the same name as an input pins that
    input to the given integer instead.

    Returns None when nothing matches.  Otherwise returns a tuple of the
    regex match object for the first match and a dict mapping each input
    name to its value, decoded as a little-endian integer.

    Raises SyntaxError when a line cannot be parsed or corresponds to no
    known instruction.
    """
    # TODO error if something in kwargs isn't a pattern input?
    # TODO the return value here is goofy
    # TODO maybe use finditer and yield instead
    pattern_chunks = []
    # Input name -> regex backreference to the group that first captured it
    input_table = OrderedDict()
    for line in needle.splitlines():
        line = re.sub(';.*', '', line).strip()
        if not line:
            continue

        mnemonic, args = Instruction.partial_parse(line)
        # Use .get() rather than indexing: mnemonics is a defaultdict, and
        # indexing it with an unknown mnemonic would insert an empty entry
        for candidate in gbz80.mnemonics.get(mnemonic, ()):
            inputs = candidate.match_inputs(mnemonic, args)
            if inputs is not None:
                break
        else:
            raise SyntaxError(
                "No known instruction matches {!r}".format(line))

        instr = candidate
        pattern_chunks.append(re.escape(instr.prefix))
        for instr_input, pattern_atom in inputs:
            if pattern_atom.is_constant:
                # A literal value in the pattern: match exactly those bytes
                pattern_chunks.append(re.escape(
                    pattern_atom.value.to_bytes(
                        instr_input.length, byteorder='little')))
            elif pattern_atom.name in input_table:
                # Seen before: must repeat whatever was captured then
                pattern_chunks.append(input_table[pattern_atom.name])
            else:
                if pattern_atom.name in kwargs:
                    inner_pattern = re.escape(
                        kwargs[pattern_atom.name].to_bytes(
                            instr_input.length, byteorder='little'))
                else:
                    inner_pattern = b'.' * instr_input.length

                group_name = pattern_atom.name.encode('ascii')
                input_table[pattern_atom.name] = b'(?P=%b)' % (group_name,)
                pattern_chunks.append(
                    b'(?P<%b>%b)' % (group_name, inner_pattern))

    pattern = b''.join(pattern_chunks)

    # DOTALL so that '.' also matches a newline byte
    m = re.search(pattern, haystack, flags=re.DOTALL)
    if not m:
        return None

    matched_inputs = {}
    for inp in input_table:
        matched_inputs[inp] = int.from_bytes(
            m.group(inp), byteorder='little')
    return m, matched_inputs
|
|
@ -14,13 +14,13 @@ import hashlib
|
|||
import io
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
|
||||
from camel import Camel
|
||||
from classtools import reify
|
||||
from construct import *
|
||||
|
||||
from pokedex.extract.lib.gbz80 import find_code
|
||||
import pokedex.schema as schema
|
||||
|
||||
# TODO set this up to colorcode and use {} formatting
|
||||
|
@ -1167,8 +1167,6 @@ pokemon_struct = Struct(
|
|||
Padding(1),
|
||||
)
|
||||
|
||||
pokemon_name_struct = PokemonCString('pokemon_name', 10)
|
||||
|
||||
|
||||
evos_moves_struct = Struct(
|
||||
'evos_moves',
|
||||
|
@ -1281,28 +1279,17 @@ class RBYCart:
|
|||
Return a dict of raw file offsets. The keys are the names used in the
|
||||
pokered project.
|
||||
"""
|
||||
# The base stats are always in the same place in RBY, and only slightly
|
||||
# off in RG. Not sure why! But it hopefully means recompilation
|
||||
# doesn't affect them.
|
||||
addresses = {
|
||||
# These ones have, thus far, defied automatic detection, as they're
|
||||
# just part of a big old block of data — so I can't just look for
|
||||
# code nearby.
|
||||
# TODO these are for rby; fix for rg, and maybe y?
|
||||
'BaseStats': unbank('0E:43DE'),
|
||||
'MewBaseStats': unbank('01:425B'),
|
||||
}
|
||||
|
||||
# For everything else, the general approach is to find some assembly
|
||||
# code that appears just before the data of interest. It's pretty
|
||||
# hacky, but since translators (and even modders) would have little
|
||||
# reason to rearrange functions or inject new ones in these odd places,
|
||||
# it ought to work well enough. And it's better than ferreting out and
|
||||
# hard-coding piles of addresses.
|
||||
# The ideal approach is to find some assembly code that appears just
|
||||
# before the data of interest. It's pretty hacky, but since
|
||||
# translators (and even modders) would have little reason to rearrange
|
||||
# functions or inject new ones in these odd places, it ought to work
|
||||
# well enough. And it's better than ferreting out and hard-coding
|
||||
# piles of addresses.
|
||||
# The only hard part is that assembly code that contains an address
|
||||
# won't work, since that address will also vary per game.
|
||||
# Each of the landmarks used here appears in every official cartridge
|
||||
# exactly once.
|
||||
addresses = {}
|
||||
|
||||
# This is an entire function used by the Pokédex and which immediately
|
||||
# precedes all the flavor text.
|
||||
|
@ -1322,6 +1309,72 @@ class RBYCart:
|
|||
raise CartDetectionError("Can't find evolution and moveset table")
|
||||
addresses['EvosMovesPointerTable'] = idx + len(asm_WriteMonMoves_ShiftMoveData) + 5
|
||||
|
||||
# Several lists of names are accessed by a single function, which looks
|
||||
# through a list of pointers to find the right set of names to use.
|
||||
# That's great news for me: I can just grab all of those delicious
|
||||
# pointers at once. Here's an excerpt from GetName.
|
||||
match = find_code(self.data, '''
|
||||
inc d
|
||||
;.skip
|
||||
ld hl, #NamePointers
|
||||
add hl,de
|
||||
ld a,[hl+]
|
||||
ldh [$96],a
|
||||
ld a,[hl]
|
||||
ldh [$95],a
|
||||
ldh a,[$95]
|
||||
ld h,a
|
||||
ldh a,[$96]
|
||||
ld l,a
|
||||
ld a,[#wd0b5]
|
||||
ld b,a
|
||||
ld c,0
|
||||
;.nextName
|
||||
ld d,h
|
||||
ld e,l
|
||||
;.nextChar
|
||||
ld a,[hl+]
|
||||
cp $50 ; terminator @, encoded
|
||||
''')
|
||||
if not match:
|
||||
raise CartDetectionError("Can't find name array")
|
||||
rem, inputs = match
|
||||
start = inputs['NamePointers']
|
||||
name_pointers = Array(7, ULInt16('dummy')).parse(
|
||||
self.data[start:start + 14])
|
||||
# One downside to the Game Boy memory structure is that banks are
|
||||
# not stored anywhere near their corresponding addresses. Most
|
||||
# bank numbers are hardcoded here, but Pokémon names are in a different
|
||||
# bank in Japanese games, so we've gotta scrape the bank too...
|
||||
match = find_code(self.data, '''
|
||||
;GetMonName::
|
||||
push hl
|
||||
ldh a,[#H_LOADEDROMBANK]
|
||||
push af
|
||||
ld a,#BANK_MonsterNames
|
||||
ldh [#H_LOADEDROMBANK],a
|
||||
ld [#MBC1RomBank],a
|
||||
ld a,[#wd11e]
|
||||
dec a
|
||||
ld hl,#MonsterNames
|
||||
''',
|
||||
H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF
|
||||
MBC1RomBank=0x2000,
|
||||
MonsterNames=name_pointers[0]
|
||||
)
|
||||
if not match:
|
||||
raise CartDetectionError("Can't find Pokémon names")
|
||||
rem, inputs = match
|
||||
|
||||
addresses['MonsterNames'] = unbank(
|
||||
inputs['BANK_MonsterNames'], name_pointers[0])
|
||||
addresses['MoveNames'] = unbank(0x2C, name_pointers[1])
|
||||
# 2: UnusedNames (unused, obviously)
|
||||
addresses['ItemNames'] = unbank(0x01, name_pointers[3])
|
||||
# 4: wPartyMonOT (only useful while the game is running)
|
||||
# 5: wEnemyMonOT (only useful while the game is running)
|
||||
addresses['TrainerNames'] = unbank(0x0E, name_pointers[6])
|
||||
|
||||
# Finding TMs is a bit harder. They come right after a function for
|
||||
# looking up a TM number, which is very short and very full of
|
||||
# addresses. So here's a regex.
|
||||
|
@ -1330,26 +1383,29 @@ class RBYCart:
|
|||
# In English it is, unsurprisingly, 0xD11E.
|
||||
# `TechnicalMachines` is the address we're looking for, which should
|
||||
# immediately follow what this matches.
|
||||
asm_TMToMove_rx = re.compile(rb'''
|
||||
\xfa (..) # ld a, [wd11e]
|
||||
\x3d # dec a
|
||||
\x21 (..) # ld hl, TechnicalMachines
|
||||
\x06 \x00 # ld b, $0
|
||||
\x4f # ld c, a
|
||||
\x09 # add hl, bc
|
||||
\x7e # ld a, [hl]
|
||||
\xea \1 # ld [wd11e], a
|
||||
\xc9 # ret
|
||||
''', flags=re.DOTALL | re.VERBOSE)
|
||||
for match in asm_TMToMove_rx.finditer(self.data):
|
||||
matched_addr = ULInt16('...').parse(match.group(2))
|
||||
tentative_addr = match.end()
|
||||
match = find_code(self.data, '''
|
||||
ld a, [#wd11e]
|
||||
dec a
|
||||
ld hl, #TechnicalMachines
|
||||
ld b, $0
|
||||
ld c, a
|
||||
add hl, bc
|
||||
ld a, [hl]
|
||||
ld [#wd11e], a
|
||||
ret
|
||||
''')
|
||||
if match:
|
||||
rem, inputs = match
|
||||
# TODO this should maybe also check that the address immediately follows this code
|
||||
matched_addr = inputs['TechnicalMachines']
|
||||
tentative_addr = rem.end()
|
||||
# Remember, addresses don't include the bank!
|
||||
_, banked_addr = bank(tentative_addr)
|
||||
if matched_addr == banked_addr:
|
||||
asm_wd11e_addr = match.group(1)
|
||||
asm_wd11e_addr = inputs['wd11e']
|
||||
addresses['TechnicalMachines'] = tentative_addr
|
||||
break
|
||||
else:
|
||||
raise RuntimeError
|
||||
# TODO should there really be more than one match?
|
||||
else:
|
||||
raise CartDetectionError("Can't find technical machines list")
|
||||
|
@ -1359,61 +1415,107 @@ class RBYCart:
|
|||
# These are almost immediately after the Pokédex entries themselves,
|
||||
# but this actually seems easier than figuring out where a table of
|
||||
# pointers ends.
|
||||
asm_IndexToPokedex_rx = re.compile(rb'''
|
||||
\xc5 # push bc
|
||||
\xe5 # push hl
|
||||
\xfa (..) # ld a,[wd11e]
|
||||
\x3d # dec a
|
||||
\x21 (..) # ld hl,PokedexOrder
|
||||
\x06 \x00 # ld b,0
|
||||
\x4f # ld c,a
|
||||
\x09 # add hl,bc
|
||||
\x7e # ld a,[hl]
|
||||
\xea \1 # ld [wd11e],a
|
||||
\xe1 # pop hl
|
||||
\xc1 # pop bc
|
||||
\xc9 # ret
|
||||
''', flags=re.DOTALL | re.VERBOSE)
|
||||
for match in asm_IndexToPokedex_rx.finditer(self.data):
|
||||
matched_addr = ULInt16('...').parse(match.group(2))
|
||||
tentative_addr = match.end()
|
||||
match = find_code(self.data, '''
|
||||
push bc
|
||||
push hl
|
||||
ld a, [#wd11e]
|
||||
dec a
|
||||
ld hl, #PokedexOrder
|
||||
ld b, 0
|
||||
ld c, a
|
||||
add hl, bc
|
||||
ld a, [hl]
|
||||
ld [#wd11e], a
|
||||
pop hl
|
||||
pop bc
|
||||
ret
|
||||
''', wd11e=asm_wd11e_addr)
|
||||
if match:
|
||||
rem, inputs = match
|
||||
matched_addr = inputs['PokedexOrder']
|
||||
tentative_addr = rem.end()
|
||||
# Remember, addresses don't include the bank!
|
||||
_, banked_addr = bank(tentative_addr)
|
||||
if matched_addr == banked_addr and asm_wd11e_addr == match.group(1):
|
||||
if matched_addr == banked_addr:
|
||||
addresses['PokedexOrder'] = tentative_addr
|
||||
break
|
||||
else:
|
||||
raise RuntimeError
|
||||
else:
|
||||
raise CartDetectionError("Can't find Pokédex order")
|
||||
|
||||
# This is assembly code that appears near the end of a function called
|
||||
# WaitForSoundToFinish.
|
||||
end_of_WaitForSoundToFinish = bytes.fromhex('afb6 23b6 2323 b6')
|
||||
try:
|
||||
idx = self.data.index(end_of_WaitForSoundToFinish)
|
||||
except ValueError:
|
||||
raise CartDetectionError("Can't find name array")
|
||||
# There are a couple more bytes in the function, but they involve an
|
||||
# address so they can't be searched for. Red/Green/Blue have four;
|
||||
# Yellow has an extra 'and', which is annoying, but at least easy to
|
||||
# handle.
|
||||
start = idx + len(end_of_WaitForSoundToFinish)
|
||||
if self.data[start] == 0xA7:
|
||||
# Yellow; skip one more byte
|
||||
start += 1
|
||||
start += 4
|
||||
|
||||
name_pointers = Array(7, ULInt16('dummy')).parse(self.data[start:start + 14])
|
||||
# One downside to the Game Boy memory structure is that banks are not
|
||||
# stored anywhere near their corresponding addresses, so the bank
|
||||
# numbers are hardcoded here. They're fairly unlikely to change
|
||||
# between games. Right? Probably?
|
||||
addresses['MonsterNames'] = unbank(0x07, name_pointers[0])
|
||||
addresses['MoveNames'] = unbank(0x2C, name_pointers[1])
|
||||
# 2: UnusedNames (unused, obviously)
|
||||
addresses['ItemNames'] = unbank(0x01, name_pointers[3])
|
||||
# 4: wPartyMonOT (only useful while the game is running)
|
||||
# 5: wEnemyMonOT (only useful while the game is running)
|
||||
addresses['TrainerNames'] = unbank(0x0E, name_pointers[6])
|
||||
# Ah, but then, we have base stats. These don't have code nearby;
|
||||
# they're just stuck immediately after moves. Except in R/G, where
|
||||
# they appear /before/ moves! And we don't know what version we're
|
||||
# running yet, because the addresses detected in this method are used
|
||||
# for language detection. Hmm.
|
||||
# Here's plan B: look for the function that /loads/ base stats, and
|
||||
# scrape the address out of it. This function is a bit hairy; I've had
|
||||
# to expand some of pokered's macros and rewrite the jumps to something
|
||||
# that the rudimentary code matcher can understand.
|
||||
match = find_code(self.data, '''
|
||||
ldh a, [#H_LOADEDROMBANK]
|
||||
push af
|
||||
ld a, #BANK_BaseStats
|
||||
ldh [#H_LOADEDROMBANK], a
|
||||
ld [#MBC1RomBank], a
|
||||
push bc
|
||||
push de
|
||||
push hl
|
||||
ld a, [#wd11e]
|
||||
push af
|
||||
ld a,[#wd0b5]
|
||||
ld [#wd11e],a
|
||||
ld de,#FossilKabutopsPic
|
||||
ld b,$66 ; size of Kabutops fossil and Ghost sprites
|
||||
cp #FOSSIL_KABUTOPS ; Kabutops fossil
|
||||
jr z,#specialID1
|
||||
ld de,#GhostPic
|
||||
cp #MON_GHOST ; Ghost
|
||||
jr z,#specialID2
|
||||
ld de,#FossilAerodactylPic
|
||||
ld b,$77 ; size of Aerodactyl fossil sprite
|
||||
cp #FOSSIL_AERODACTYL ; Aerodactyl fossil
|
||||
jr z,#specialID3
|
||||
cp #MEW
|
||||
jr z,#mew
|
||||
ld a, #IndexToPokedexPredef
|
||||
call #IndexToPokedex ; convert pokemon ID in [wd11e] to pokedex number
|
||||
ld a,[#wd11e]
|
||||
dec a
|
||||
ld bc, #MonBaseStatsLength
|
||||
ld hl, #BaseStats
|
||||
call #AddNTimes
|
||||
ld de, #wMonHeader
|
||||
ld bc, #MonBaseStatsLength
|
||||
call #CopyData
|
||||
jr #done1
|
||||
;.specialID
|
||||
ld hl, #wMonHSpriteDim
|
||||
ld [hl], b ; write sprite dimensions
|
||||
inc hl
|
||||
ld [hl], e ; write front sprite pointer
|
||||
inc hl
|
||||
ld [hl], d
|
||||
jr #done2
|
||||
;.mew
|
||||
ld hl, #MewBaseStats
|
||||
ld de, #wMonHeader
|
||||
ld bc, #MonBaseStatsLength
|
||||
ld a, #BANK_MewBaseStats
|
||||
call #FarCopyData
|
||||
''',
|
||||
# These are constants; I left them in the above code for clarity
|
||||
H_LOADEDROMBANK=0xB8, # full address is $FFB8; ldh adds the $FF
|
||||
MBC1RomBank=0x2000,
|
||||
# This was scraped previously
|
||||
wd11e=asm_wd11e_addr,
|
||||
)
|
||||
if match:
|
||||
rem, inputs = match
|
||||
addresses['BaseStats'] = unbank(inputs['BANK_BaseStats'], inputs['BaseStats'])
|
||||
addresses['MewBaseStats'] = unbank(inputs['BANK_MewBaseStats'], inputs['MewBaseStats'])
|
||||
else:
|
||||
raise CartDetectionError("Can't find base stats")
|
||||
|
||||
return addresses
|
||||
|
||||
|
@ -1612,7 +1714,12 @@ class RBYCart:
|
|||
ret = [None] * self.NUM_POKEMON
|
||||
|
||||
self.stream.seek(self.addrs['MonsterNames'])
|
||||
for index, pokemon_name in enumerate(Array(self.max_pokemon_index, pokemon_name_struct).parse_stream(self.stream), start=1):
|
||||
# TODO i don't like this, but they don't have explicit terminators...
|
||||
if self.language == 'ja':
|
||||
name_length = 5
|
||||
else:
|
||||
name_length = 10
|
||||
for index, pokemon_name in enumerate(Array(self.max_pokemon_index, PokemonCString('...', name_length)).parse_stream(self.stream), start=1):
|
||||
try:
|
||||
id = self.pokedex_order[index]
|
||||
except KeyError:
|
||||
|
@ -1631,7 +1738,6 @@ class RBYCart:
|
|||
def pokemon_records(self):
|
||||
"""List of pokemon_structs."""
|
||||
self.stream.seek(self.addrs['BaseStats'])
|
||||
print(self.stream.read(100).hex())
|
||||
records = Array(self.NUM_POKEMON - 1, pokemon_struct).parse_stream(self.stream)
|
||||
# Mew's data is, awkwardly, stored separately
|
||||
self.stream.seek(self.addrs['MewBaseStats'])
|
||||
|
|
Loading…
Reference in a new issue