From 2631d36963dd285bddf63016387e8dbd4a13f717 Mon Sep 17 00:00:00 2001 From: "Eevee (Lexy Munroe)" Date: Thu, 26 Jan 2017 16:26:50 -0800 Subject: [PATCH] Add an experimental lazy CompressedStream that actually seems to work Unfortunately it may not make any practical difference, since a compressed PC file still needs to be fully decompressed just to check the magic numbers of its subfiles. May revert this later, I dunno. --- pokedex/extract/lib/compressed.py | 142 ++++++++++++++++++++++++++++++ pokedex/extract/lib/garc.py | 51 +++-------- 2 files changed, 153 insertions(+), 40 deletions(-) create mode 100644 pokedex/extract/lib/compressed.py diff --git a/pokedex/extract/lib/compressed.py b/pokedex/extract/lib/compressed.py new file mode 100644 index 0000000..c195efe --- /dev/null +++ b/pokedex/extract/lib/compressed.py @@ -0,0 +1,142 @@ +"""Substreams that can handle compressed data.""" +import struct + +from .base import Substream + + +class DecompressionError(ValueError): + pass + + +class CompressedStream: + def __init__(self, stream): + self.stream = stream + self.data = bytearray() + self.pos = 0 + self._read_header() + + def __len__(self): + return self.length + + def read(self, n=-1): + maxread = self.length - self.pos + if n < 0 or 0 <= maxread < n: + n = maxread + self._ensure_bytes(self.pos + n) + data = self.data[self.pos:self.pos + n] + self.pos += n + return data + + def seek(self, offset, whence=0): + if whence == 1: + offset += self.pos + elif whence == 2: + offset += self.length + offset = max(offset, 0) + if self.length >= 0: + offset = min(offset, self.length) + self.pos = offset + + def tell(self): + return self.pos + + def peek(self, n): + pos = self.tell() + maxread = self.length - self.pos + data = self.read(min(maxread, n)) + self.seek(pos) + return data + + def unpack(self, fmt): + """Unpacks a struct format from the current position in the stream.""" + data = self.read(struct.calcsize(fmt)) + return struct.unpack(fmt, data) + + def slice(self, offset, length=-1): + # TODO limit or warn if length is too long for this slice? + raise RuntimeError + return Substream(self, offset, length) + + +def _bits(byte): + return ((byte >> 7) & 1, + (byte >> 6) & 1, + (byte >> 5) & 1, + (byte >> 4) & 1, + (byte >> 3) & 1, + (byte >> 2) & 1, + (byte >> 1) & 1, + (byte) & 1) + + +class LZSS11CompressedStream(CompressedStream): + def _read_header(self): + header = self.stream.read(4) + self.compressed_pos = self.stream.tell() + assert header[0] == 0x11 + self.length, = struct.unpack('> 4 + + if indicator == 0: + # 8 bit count, 12 bit disp + # indicator is 0, don't need to mask b + count = (b << 4) + b = readbyte() + count += b >> 4 + count += 0x11 + elif indicator == 1: + # 16 bit count, 12 bit disp + count = ((b & 0xf) << 12) + (readbyte() << 4) + b = readbyte() + count += b >> 4 + count += 0x111 + else: + # indicator is count (4 bits), 12 bit disp + count = indicator + count += 1 + + disp = ((b & 0xf) << 8) + readbyte() + disp += 1 + + try: + for _ in range(count): + writebyte(self.data[-disp]) + except IndexError: + # FIXME `it` no longer exists, need len of substream + raise DecompressionError(count, disp, len(self.data), len(self.stream), self.compressed_pos, self.stream.tell()) + else: + raise DecompressionError(flag) + + if needed <= len(self.data): + break + + self.compressed_pos = self.stream.tell() + + # FIXME check this once we hit eof + #if len(self.data) != decompressed_size: + # raise DecompressionError( + # "decompressed size does not match the expected size") diff --git a/pokedex/extract/lib/garc.py b/pokedex/extract/lib/garc.py index 4eeadb0..358b6f7 100644 --- a/pokedex/extract/lib/garc.py +++ b/pokedex/extract/lib/garc.py @@ -94,7 +94,17 @@ class GARCEntry(object): def __getitem__(self, i): start, length = self.slices[i] ss = self.stream.slice(start, length) - if ss.peek(1) in b'\x10\x11': + peek = ss.peek(1) + if peek == b'\x11': + from .compressed import DecompressionError, LZSS11CompressedStream + decompressor = LZSS11CompressedStream(ss) + try: + decompressor.peek(256) + except DecompressionError: + return ss + else: + return decompressor + elif ss.peek(1) in b'\x10\x11': # XXX this sucks but there's no real way to know for sure whether # data is compressed or not. maybe just bake this into the caller # and let them deal with it, same way we do with text decoding? @@ -116,45 +126,6 @@ class GARCEntry(object): return len(self.slices) -class CompressedStream: - def __init__(self, stream): - self.stream = stream - header = stream.read(4) - stream.seek(0) - assert header[0] in b'\x10\x11' - self.length, = struct.unpack('