1
2 """
3 A stream friendly, simple compression library, built around
4 iterators. See L{compress} and L{decompress} for the easiest way to
5 get started.
6
7 After the TIFF implementation of LZW, as described at
8 U{http://www.fileformat.info/format/tiff/corion-lzw.htm}
9
10
11 In an even-nuttier-shell, lzw compresses input bytes with integer
12 codes. Starting with codes 0-255 that code to themselves, and two
13 control codes, we work our way through a stream of bytes. When we
14 encounter a pair of codes c1,c2 we add another entry to our code table
15 with the lowest available code and the value value(c1) + value(c2)[0]
16
17 Of course, there are details :)
18
19 The Details
20 ===========
21
22 Our control codes are
23
24 - CLEAR_CODE (codepoint 256). When this code is encountered, we flush
25 the codebook and start over.
26 - END_OF_INFO_CODE (codepoint 257). This code is reserved for
27 encoder/decoders over the integer codepoint stream (like the
28 mechanical bit that unpacks bits into codepoints)
29
30 When dealing with bytes, codes are emitted as variable
31 length bit strings packed into the stream of bytes.
32
33 codepoints are written with varying length
34 - initially 9 bits
35 - at 512 entries 10 bits
36 - at 1025 entries at 11 bits
37 - at 2048 entries 12 bits
38 - with max of 4095 entries in a table (including Clear and EOI)
39
40 code points are stored with their MSB in the most significant bit
41 available in the output character.
42
43 >>> import lzw
44 >>>
45 >>> mybytes = lzw.readbytes("README.txt")
46 >>> lessbytes = lzw.compress(mybytes)
47 >>> newbytes = b"".join(lzw.decompress(lessbytes))
48 >>> oldbytes = b"".join(lzw.readbytes("README.txt"))
49 >>> oldbytes == newbytes
50 True
51
52
53 """
54
55 __author__ = "Joe Bowers"
56 __license__ = "MIT License"
57 __version__ = "0.01"
58 __status__ = "Development"
59 __email__ = "joerbowers@gmail.com"
60 __url__ = "http://www.joe-bowers.com/static/lzw"
61
62 import struct
63 import itertools
64
65
66 CLEAR_CODE = 256
67 END_OF_INFO_CODE = 257
68
69 DEFAULT_MIN_BITS = 9
70 DEFAULT_MAX_BITS = 12
71
72
73
74
76 """
77 Given an iterable of bytes, returns a (hopefully shorter) iterable
78 of bytes that you can store in a file or pass over the network or
79 what-have-you, and later use to get back your original bytes with
80 L{decompress}. This is the best place to start using this module.
81 """
82 encoder = ByteEncoder()
83 return encoder.encodetobytes(plaintext_bytes)
84
85
87 """
88 Given an iterable of bytes that were the result of a call to
89 L{compress}, returns an iterator over the uncompressed bytes.
90 """
91 decoder = ByteDecoder()
92 return decoder.decodefrombytes(compressed_bytes)
93
94
95
96
97
99 """
100 Takes a stream of uncompressed bytes and produces a stream of
101 compressed bytes, usable by L{ByteDecoder}. Combines an L{Encoder}
102 with a L{BitPacker}.
103
104
105 >>> import lzw
106 >>>
107 >>> enc = lzw.ByteEncoder(12)
108 >>> bigstr = b"gabba gabba yo gabba gabba gabba yo gabba gabba gabba yo gabba gabba gabba yo"
109 >>> encoding = enc.encodetobytes(bigstr)
110 >>> encoded = b"".join( b for b in encoding )
111 >>> encoded
112 '3\\x98LF#\\x08\\x82\\x05\\x04\\x83\\x1eM\\xf0x\\x1c\\x16\\x1b\\t\\x88C\\xe1q(4"\\x1f\\x17\\x85C#1X\\xec.\\x00'
113 >>>
114 >>> dec = lzw.ByteDecoder()
115 >>> decoding = dec.decodefrombytes(encoded)
116 >>> decoded = b"".join(decoding)
117 >>> decoded == bigstr
118 True
119
120 """
121
123 """
124 max_width is the maximum width in bits we want to see in the
125 output stream of codepoints.
126 """
127 self._encoder = Encoder(max_code_size=2**max_width)
128 self._packer = BitPacker(initial_code_size=self._encoder.code_size())
129
130
132 """
133 Returns an iterator of bytes, adjusting our packed width
134 between minwidth and maxwidth when it detects an overflow is
135 about to occur. Dual of L{ByteDecoder.decodefrombytes}.
136 """
137 codepoints = self._encoder.encode(bytesource)
138 codebytes = self._packer.pack(codepoints)
139
140 return codebytes
141
142
144 """
145 Decodes, combines bit-unpacking and interpreting a codepoint
146 stream, suitable for use with bytes generated by
147 L{ByteEncoder}.
148
149 See L{ByteDecoder} for a usage example.
150 """
155
157 """
158 Given an iterator over BitPacked, Encoded bytes, Returns an
159 iterator over the uncompressed bytes. Dual of
160 L{ByteEncoder.encodetobytes}. See L{ByteEncoder} for an
161 example of use.
162 """
163 codepoints = self._unpacker.unpack(bytesource)
164 clearbytes = self._decoder.decode(codepoints)
165
166 return clearbytes
167
168
170 """
171 Translates a stream of lzw codepoints into a variable width packed
172 stream of bytes, for use by L{BitUnpacker}. One of a (potential)
173 set of encoders for a stream of LZW codepoints, intended to behave
174 as closely to the TIFF variable-width encoding scheme as closely
175 as possible.
176
177 The inbound stream of integer lzw codepoints are packed into
178 variable width bit fields, starting at the smallest number of bits
179 it can and then increasing the bit width as it anticipates the LZW
180 code size growing to overflow.
181
182 This class knows all kinds of intimate things about how it's
183 upstream codepoint processors work; it knows the control codes
184 CLEAR_CODE and END_OF_INFO_CODE, and (more intimately still), it
185 makes assumptions about the rate of growth of it's consumer's
186 codebook. This is ok, as long as the underlying encoder/decoders
187 don't know any intimate details about their BitPackers/Unpackers
188 """
189
191 """
192 Takes an initial code book size (that is, the count of known
193 codes at the beginning of encoding, or after a clear)
194 """
195 self._initial_code_size = initial_code_size
196
197
198 - def pack(self, codepoints):
199 """
200 Given an iterator of integer codepoints, returns an iterator
201 over bytes containing the codepoints packed into varying
202 lengths, with bit width growing to accomodate an input code
203 that it assumes will grow by one entry per codepoint seen.
204
205 Widths will be reset to the given initial_code_size when the
206 LZW CLEAR_CODE or END_OF_INFO_CODE code appears in the input,
207 and bytes following END_OF_INFO_CODE will be aligned to the
208 next byte boundary.
209
210 >>> import lzw
211 >>> pkr = lzw.BitPacker(258)
212 >>> [ b for b in pkr.pack([ 1, 257]) ] == [ chr(0), chr(0xC0), chr(0x40) ]
213 True
214 """
215 tailbits = []
216 codesize = self._initial_code_size
217
218 minwidth = 8
219 while (1 << minwidth) < codesize:
220 minwidth = minwidth + 1
221
222 nextwidth = minwidth
223
224 for pt in codepoints:
225
226 newbits = inttobits(pt, nextwidth)
227 tailbits = tailbits + newbits
228
229
230
231
232 codesize = codesize + 1
233
234 if pt == END_OF_INFO_CODE:
235 while len(tailbits) % 8:
236 tailbits.append(0)
237
238 if pt in [ CLEAR_CODE, END_OF_INFO_CODE ]:
239 nextwidth = minwidth
240 codesize = self._initial_code_size
241 elif codesize >= (2 ** nextwidth):
242 nextwidth = nextwidth + 1
243
244 while len(tailbits) > 8:
245 nextbits = tailbits[:8]
246 nextbytes = bitstobytes(nextbits)
247 for bt in nextbytes:
248 yield struct.pack("B", bt)
249
250 tailbits = tailbits[8:]
251
252
253 if tailbits:
254 tail = bitstobytes(tailbits)
255 for bt in tail:
256 yield struct.pack("B", bt)
257
258
259
260
262 """
263 An adaptive-width bit unpacker, intended to decode streams written
264 by L{BitPacker} into integer codepoints. Like L{BitPacker}, knows
265 about code size changes and control codes.
266 """
267
269 """
270 initial_code_size is the starting size of the codebook
271 associated with the to-be-unpacked stream.
272 """
273 self._initial_code_size = initial_code_size
274
275
276 - def unpack(self, bytesource):
277 """
278 Given an iterator of bytes, returns an iterator of integer
279 code points. Auto-magically adjusts point width when it sees
280 an almost-overflow in the input stream, or an LZW CLEAR_CODE
281 or END_OF_INFO_CODE
282
283 Trailing bits at the end of the given iterator, after the last
284 codepoint, will be dropped on the floor.
285
286 At the end of the iteration, or when an END_OF_INFO_CODE seen
287 the unpacker will ignore the bits after the code until it
288 reaches the next aligned byte. END_OF_INFO_CODE will *not*
289 stop the generator, just reset the alignment and the width
290
291
292 >>> import lzw
293 >>> unpk = lzw.BitUnpacker(initial_code_size=258)
294 >>> [ i for i in unpk.unpack([ chr(0), chr(0xC0), chr(0x40) ]) ]
295 [1, 257]
296 """
297 bits = []
298 offset = 0
299 ignore = 0
300
301 codesize = self._initial_code_size
302 minwidth = 8
303 while (1 << minwidth) < codesize:
304 minwidth = minwidth + 1
305
306 pointwidth = minwidth
307
308 for nextbit in bytestobits(bytesource):
309
310 offset = (offset + 1) % 8
311 if ignore > 0:
312 ignore = ignore - 1
313 continue
314
315 bits.append(nextbit)
316
317 if len(bits) == pointwidth:
318 codepoint = intfrombits(bits)
319 bits = []
320
321 yield codepoint
322
323 codesize = codesize + 1
324
325 if codepoint in [ CLEAR_CODE, END_OF_INFO_CODE ]:
326 codesize = self._initial_code_size
327 pointwidth = minwidth
328 else:
329
330 while codesize >= (2 ** pointwidth):
331 pointwidth = pointwidth + 1
332
333 if codepoint == END_OF_INFO_CODE:
334 ignore = (8 - offset) % 8
335
336
337
339 """
340 Uncompresses a stream of lzw code points, as created by
341 L{Encoder}. Given a list of integer code points, with all
342 unpacking foolishness complete, turns that list of codepoints into
343 a list of uncompressed bytes. See L{BitUnpacker} for what this
344 doesn't do.
345 """
347 """
348 Creates a new Decoder. Decoders should not be reused for
349 different streams.
350 """
351 self._clear_codes()
352 self.remainder = []
353
354
356 """
357 Returns the current size of the Decoder's code book, that is,
358 it's mapping of codepoints to byte strings. The return value of
359 this method will change as the decode encounters more encoded
360 input, or control codes.
361 """
362 return len(self._codepoints)
363
364
365 - def decode(self, codepoints):
366 """
367 Given an iterable of integer codepoints, yields the
368 corresponding bytes, one at a time, as byte strings of length
369 E{1}. Retains the state of the codebook from call to call, so
370 if you have another stream, you'll likely need another
371 decoder!
372
373 Decoders will NOT handle END_OF_INFO_CODE (rather, they will
374 handle the code by throwing an exception); END_OF_INFO should
375 be handled by the upstream codepoint generator (see
376 L{BitUnpacker}, for example)
377
378 >>> import lzw
379 >>> dec = lzw.Decoder()
380 >>> ''.join(dec.decode([103, 97, 98, 98, 97, 32, 258, 260, 262, 121, 111, 263, 259, 261, 256]))
381 'gabba gabba yo gabba'
382
383 """
384 codepoints = [ cp for cp in codepoints ]
385
386 for cp in codepoints:
387 decoded = self._decode_codepoint(cp)
388 for character in decoded:
389 yield character
390
391
392
394 """
395 Will raise a ValueError if given an END_OF_INFORMATION
396 code. EOI codes should be handled by callers if they're
397 present in our source stream.
398
399 >>> import lzw
400 >>> dec = lzw.Decoder()
401 >>> beforesize = dec.code_size()
402 >>> dec._decode_codepoint(0x80)
403 '\\x80'
404 >>> dec._decode_codepoint(0x81)
405 '\\x81'
406 >>> beforesize + 1 == dec.code_size()
407 True
408 >>> dec._decode_codepoint(256)
409 ''
410 >>> beforesize == dec.code_size()
411 True
412 """
413
414 ret = b""
415
416 if codepoint == CLEAR_CODE:
417 self._clear_codes()
418 elif codepoint == END_OF_INFO_CODE:
419 raise ValueError("End of information code not supported directly by this Decoder")
420 else:
421 if codepoint in self._codepoints:
422 ret = self._codepoints[ codepoint ]
423 if None != self._prefix:
424 self._codepoints[ len(self._codepoints) ] = self._prefix + ret[0]
425
426 else:
427 ret = self._prefix + self._prefix[0]
428 self._codepoints[ len(self._codepoints) ] = ret
429
430 self._prefix = ret
431
432 return ret
433
434
440
441
443 """
444 Given an iterator of bytes, returns an iterator of integer
445 codepoints, suitable for use by L{Decoder}. The core of the
446 "compression" side of lzw compression/decompression.
447 """
449 """
450 When the encoding codebook grows larger than max_code_size,
451 the Encoder will clear its codebook and emit a CLEAR_CODE
452 """
453
454 self.closed = False
455
456 self._max_code_size = max_code_size
457 self._buffer = ''
458 self._clear_codes()
459
460 if max_code_size < self.code_size():
461 raise ValueError("Max code size too small, (must be at least {0})".format(self.code_size()))
462
463
465 """
466 Returns a count of the known codes, including codes that are
467 implicit in the data but have not yet been produced by the
468 iterator.
469 """
470 return len(self._prefixes)
471
472
474 """
475 Yields any buffered codepoints, followed by a CLEAR_CODE, and
476 clears the codebook as a side effect.
477 """
478
479 flushed = []
480
481 if self._buffer:
482 yield self._prefixes[ self._buffer ]
483 self._buffer = ''
484
485 yield CLEAR_CODE
486 self._clear_codes()
487
488
489
490
491 - def encode(self, bytesource):
492 """
493 Given an iterator over bytes, yields the
494 corresponding stream of codepoints.
495 Will clear the codes at the end of the stream.
496
497 >>> import lzw
498 >>> enc = lzw.Encoder()
499 >>> [ cp for cp in enc.encode("gabba gabba yo gabba") ]
500 [103, 97, 98, 98, 97, 32, 258, 260, 262, 121, 111, 263, 259, 261, 256]
501
502 """
503 for b in bytesource:
504 for point in self._encode_byte(b):
505 yield point
506
507 if self.code_size() >= self._max_code_size:
508 for pt in self.flush():
509 yield pt
510
511 for point in self.flush():
512 yield point
513
514
516
517
518
519
520
521
522 new_prefix = self._buffer
523
524 if new_prefix + byte in self._prefixes:
525 new_prefix = new_prefix + byte
526 elif new_prefix:
527 encoded = self._prefixes[ new_prefix ]
528 self._add_code(new_prefix + byte)
529 new_prefix = byte
530
531 yield encoded
532
533 self._buffer = new_prefix
534
535
536
537
546
547
549 self._prefixes[ newstring ] = len(self._prefixes)
550
551
552
554 """
555 UNTESTED. Handles encoding of multiple chunks or streams of encodable data,
556 separated with control codes. Dual of PagingDecoder.
557 """
558 - def __init__(self, initial_code_size, max_code_size):
559 self._initial_code_size = initial_code_size
560 self._max_code_size = max_code_size
561
562
563 - def encodepages(self, pages):
564 """
565 Given an iterator of iterators of bytes, produces a single
566 iterator containing a delimited sequence of independantly
567 compressed LZW sequences, all beginning on a byte-aligned
568 spot, all beginning with a CLEAR code and all terminated with
569 an END_OF_INFORMATION code (and zero to seven trailing junk
570 bits.)
571
572 The dual of PagingDecoder.decodepages
573
574 >>> import lzw
575 >>> enc = lzw.PagingEncoder(257, 2**12)
576 >>> coded = enc.encodepages([ "say hammer yo hammer mc hammer go hammer",
577 ... "and the rest can go and play",
578 ... "can't touch this" ])
579 ...
580 >>> b"".join(coded)
581 '\\x80\\x1c\\xcc\\'\\x91\\x01\\xa0\\xc2m6\\x99NB\\x03\\xc9\\xbe\\x0b\\x07\\x84\\xc2\\xcd\\xa68|"\\x14 3\\xc3\\xa0\\xd1c\\x94\\x02\\x02\\x80\\x18M\\xc6A\\x01\\xd0\\xd0e\\x10\\x1c\\x8c\\xa73\\xa0\\x80\\xc7\\x02\\x10\\x19\\xcd\\xe2\\x08\\x14\\x10\\xe0l0\\x9e`\\x10\\x10\\x80\\x18\\xcc&\\xe19\\xd0@t7\\x9dLf\\x889\\xa0\\xd2s\\x80@@'
582
583 """
584
585 for page in pages:
586
587 encoder = Encoder(max_code_size=self._max_code_size)
588 codepoints = encoder.encode(page)
589 codes_and_eoi = itertools.chain([ CLEAR_CODE ], codepoints, [ END_OF_INFO_CODE ])
590
591 packer = BitPacker(initial_code_size=encoder.code_size())
592 packed = packer.pack(codes_and_eoi)
593
594 for byte in packed:
595 yield byte
596
597
598
599
601 """
602 UNTESTED. Dual of PagingEncoder, knows how to handle independantly encoded,
603 END_OF_INFO_CODE delimited chunks of an inbound byte stream
604 """
605
607 self._initial_code_size = initial_code_size
608 self._remains = []
609
610 - def next_page(self, codepoints):
611 """
612 Iterator over the next page of codepoints.
613 """
614 self._remains = []
615
616 try:
617 while 1:
618 cp = codepoints.next()
619 if cp != END_OF_INFO_CODE:
620 yield cp
621 else:
622 self._remains = codepoints
623 break
624
625 except StopIteration:
626 pass
627
628
629 - def decodepages(self, bytesource):
630 """
631 Takes an iterator of bytes, returns an iterator of iterators
632 of uncompressed data. Expects input to conform to the output
633 conventions of PagingEncoder(), in particular that "pages" are
634 separated with an END_OF_INFO_CODE and padding up to the next
635 byte boundary.
636
637 BUG: Dangling trailing page on decompression.
638
639 >>> import lzw
640 >>> pgdec = lzw.PagingDecoder(initial_code_size=257)
641 >>> pgdecoded = pgdec.decodepages(
642 ... ''.join([ '\\x80\\x1c\\xcc\\'\\x91\\x01\\xa0\\xc2m6',
643 ... '\\x99NB\\x03\\xc9\\xbe\\x0b\\x07\\x84\\xc2',
644 ... '\\xcd\\xa68|"\\x14 3\\xc3\\xa0\\xd1c\\x94',
645 ... '\\x02\\x02\\x80\\x18M\\xc6A\\x01\\xd0\\xd0e',
646 ... '\\x10\\x1c\\x8c\\xa73\\xa0\\x80\\xc7\\x02\\x10',
647 ... '\\x19\\xcd\\xe2\\x08\\x14\\x10\\xe0l0\\x9e`\\x10',
648 ... '\\x10\\x80\\x18\\xcc&\\xe19\\xd0@t7\\x9dLf\\x889',
649 ... '\\xa0\\xd2s\\x80@@' ])
650 ... )
651 >>> [ b"".join(pg) for pg in pgdecoded ]
652 ['say hammer yo hammer mc hammer go hammer', 'and the rest can go and play', "can't touch this", '']
653
654 """
655
656
657
658
659
660
661 unpacker = BitUnpacker(initial_code_size=self._initial_code_size)
662 codepoints = unpacker.unpack(bytesource)
663
664 self._remains = codepoints
665 while self._remains:
666 nextpoints = self.next_page(self._remains)
667 nextpoints = [ nx for nx in nextpoints ]
668
669 decoder = Decoder()
670 decoded = decoder.decode(nextpoints)
671 decoded = [ dec for dec in decoded ]
672
673 yield decoded
674
675
676
677
678
679
680
681
683 """
684 Given a one-byte long byte string, returns an integer. Equivalent
685 to struct.unpack("B", b)
686 """
687 (ret,) = struct.unpack("B", b)
688 return ret
689
690
691
692
693
694
696 """
697 Convenience for iterating over the bytes in a file. Given a
698 file-like object (with a read(int) method), returns an iterator
699 over the bytes of that file.
700 """
701 buff = fileobj.read(buffersize)
702 while buff:
703 for byte in buff: yield byte
704 buff = fileobj.read(buffersize)
705
706
708 """
709 Opens a file named by filename and iterates over the L{filebytes}
710 found therein. Will close the file when the bytes run out.
711 """
712 with open(filename, "rb") as infile:
713 for byte in filebytes(infile, buffersize):
714 yield byte
715
716
717
719 """
720 Convenience for emitting the bytes we generate to a file. Given a
721 filename, opens and truncates the file, dumps the bytes
722 from bytesource into it, and closes it
723 """
724
725 with open(filename, "wb") as outfile:
726 for bt in bytesource:
727 outfile.write(bt)
728
729
731 """
732 Produces an array of booleans representing the given argument as
733 an unsigned integer, MSB first. If width is given, will pad the
734 MSBs to the given width (but will NOT truncate overflowing
735 results)
736
737 >>> import lzw
738 >>> lzw.inttobits(304, width=16)
739 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]
740
741 """
742 remains = anint
743 retreverse = []
744 while remains:
745 retreverse.append(remains & 1)
746 remains = remains >> 1
747
748 retreverse.reverse()
749
750 ret = retreverse
751 if None != width:
752 ret_head = [ 0 ] * (width - len(ret))
753 ret = ret_head + ret
754
755 return ret
756
757
759 """
760 Given a list of boolean values, interprets them as a binary
761 encoded, MSB-first unsigned integer (with True == 1 and False
762 == 0) and returns the result.
763
764 >>> import lzw
765 >>> lzw.intfrombits([ 1, 0, 0, 1, 1, 0, 0, 0, 0 ])
766 304
767 """
768 ret = 0
769 lsb_first = [ b for b in bits ]
770 lsb_first.reverse()
771
772 for bit_index in range(len(lsb_first)):
773 if lsb_first[ bit_index ]:
774 ret = ret | (1 << bit_index)
775
776 return ret
777
778
780 """
781 Breaks a given iterable of bytes into an iterable of boolean
782 values representing those bytes as unsigned integers.
783
784 >>> import lzw
785 >>> [ x for x in lzw.bytestobits(b"\\x01\\x30") ]
786 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]
787 """
788 for b in bytesource:
789
790 value = unpackbyte(b)
791
792 for bitplusone in range(8, 0, -1):
793 bitindex = bitplusone - 1
794 nextbit = 1 & (value >> bitindex)
795 yield nextbit
796
797
799 """
800 Interprets an indexable list of booleans as bits, MSB first, to be
801 packed into a list of integers from 0 to 256, MSB first, with LSBs
802 zero-padded. Note this padding behavior means that round-trips of
803 bytestobits(bitstobytes(x, width=W)) may not yield what you expect
804 them to if W % 8 != 0
805
806 Does *NOT* pack the returned values into a bytearray or the like.
807
808 >>> import lzw
809 >>> bitstobytes([0, 0, 0, 0, 0, 0, 0, 0, "Yes, I'm True"]) == [ 0x00, 0x80 ]
810 True
811 >>> bitstobytes([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]) == [ 0x01, 0x30 ]
812 True
813 """
814 ret = []
815 nextbyte = 0
816 nextbit = 7
817 for bit in bits:
818 if bit:
819 nextbyte = nextbyte | (1 << nextbit)
820
821 if nextbit:
822 nextbit = nextbit - 1
823 else:
824 ret.append(nextbyte)
825 nextbit = 7
826 nextbyte = 0
827
828 if nextbit < 7: ret.append(nextbyte)
829 return ret
830