import struct

# Ported directly from the PalmDoc Perl library:
# http://kobesearch.cpan.org/htdocs/EBook-Tools/EBook/Tools/PalmDoc.pm.html


def uncompress_lz77(data):
    """Decompress a PalmDoc (LZ77-style) compressed record.

    The input is consumed one byte at a time; the value of each byte selects
    how it is decoded:

    * ``0x00``        -- literal NUL, copied to the output.
    * ``0x01``-``0x08`` -- the next *n* bytes are copied verbatim.
    * ``0x09``-``0x7f`` -- literal byte, copied to the output.
    * ``0x80``-``0xbf`` -- first byte of a two-byte, big-endian
      length/distance pair.  The two leading ID bits are dropped, the next
      11 bits are the distance back into the output and the low 3 bits are
      the copy length minus 3.
    * ``0xc0``-``0xff`` -- a space followed by the byte XOR ``0x80``.

    Args:
        data: Compressed record.  ``bytes``/``bytearray`` input yields
            ``bytes`` output.  A text ``str`` is accepted as well, treated
            as one code point per input byte, and yields ``str`` output.

    Returns:
        The decompressed data, of the same kind (binary or text) as the
        input.  When the stream is malformed (truncated length/distance
        pair, zero distance, or a distance reaching before the start of the
        output) a warning is printed and the data decoded so far is
        returned.
    """
    # Work on integer byte values so that the same code path handles both
    # binary and text input (indexing ``bytes`` already yields ints).
    is_text = isinstance(data, str) and not isinstance(data, bytes)
    if is_text:
        codes = [ord(ch) for ch in data]
    else:
        codes = list(bytearray(data))

    length = len(codes)
    out = []        # Decompressed byte values (list append is O(1)).
    offset = 0      # Current read position in ``codes``.

    while offset < length:
        ord_ = codes[offset]
        offset += 1

        if ord_ == 0 or 0x09 <= ord_ <= 0x7f:
            # NUL and 0x09..0x7f stand for themselves.
            out.append(ord_)
        elif ord_ <= 8:
            # The next ``ord_`` bytes are literal.  Slicing tolerates a
            # truncated run; the loop then ends because offset >= length.
            out.extend(codes[offset:offset + ord_])
            offset += ord_
        elif ord_ <= 0xbf:
            # Two-byte length/distance pair: consume the second byte.
            offset += 1
            if offset > length:
                print("WARNING: offset to LZ77 bits is outside of the data: %d" % offset)
                break
            # Big-endian 16-bit value with the two leading ID bits dropped.
            lz77 = ((ord_ << 8) | codes[offset - 1]) & 0x3fff
            # Length is the rightmost 3 bits + 3; the other 11 bits are the
            # distance back from the current end of the output.
            lz77length = (lz77 & 0x0007) + 3
            lz77offset = lz77 >> 3
            if lz77offset < 1:
                print("WARNING: LZ77 decompression offset is invalid!")
                break
            start = len(out) - lz77offset
            if start < 0:
                print("WARNING: LZ77 decompression reference is before" +
                      " beginning of text! %x" % lz77)
                # Return what was decoded so far, like the other error
                # paths, instead of None.
                break
            # The source range may overlap bytes produced by this very copy
            # (distance < length), so copy one byte at a time.
            for pos in range(start, start + lz77length):
                out.append(out[pos])
        else:
            # 0xc0..0xff: a space followed by the byte XOR 0x80.
            out.append(0x20)
            out.append(ord_ ^ 0x80)

    if is_text:
        return ''.join(map(chr, out))
    return bytes(bytearray(out))