Package lzw
[frames] | no frames]

Source Code for Package lzw

  1   
  2  """ 
  3  A stream friendly, simple compression library, built around 
  4  iterators. See L{compress} and L{decompress} for the easiest way to 
  5  get started. 
  6   
  7  After the TIFF implementation of LZW, as described at 
  8  U{http://www.fileformat.info/format/tiff/corion-lzw.htm} 
  9   
 10   
 11  In an even-nuttier-shell, lzw compresses input bytes with integer 
 12  codes. Starting with codes 0-255 that code to themselves, and two 
 13  control codes, we work our way through a stream of bytes. When we 
 14  encounter a pair of codes c1,c2 we add another entry to our code table 
 15  with the lowest available code and the value value(c1) + value(c2)[0] 
 16   
 17  Of course, there are details :) 
 18   
 19  The Details 
 20  =========== 
 21   
 22      Our control codes are 
 23   
 24          - CLEAR_CODE (codepoint 256). When this code is encountered, we flush 
 25            the codebook and start over. 
 26          - END_OF_INFO_CODE (codepoint 257). This code is reserved for 
 27            encoder/decoders over the integer codepoint stream (like the 
 28            mechanical bit that unpacks bits into codepoints) 
 29   
 30      When dealing with bytes, codes are emitted as variable 
 31      length bit strings packed into the stream of bytes. 
 32   
 33      codepoints are written with varying length 
 34          - initially 9 bits 
 35          - at 512 entries 10 bits 
  36      - at 1024 entries 11 bits 
 37          - at 2048 entries 12 bits 
 38          - with max of 4095 entries in a table (including Clear and EOI) 
 39   
 40      code points are stored with their MSB in the most significant bit 
 41      available in the output character. 
 42   
 43  >>> import lzw 
 44  >>> 
 45  >>> mybytes = lzw.readbytes("README.txt") 
 46  >>> lessbytes = lzw.compress(mybytes) 
 47  >>> newbytes = b"".join(lzw.decompress(lessbytes)) 
 48  >>> oldbytes = b"".join(lzw.readbytes("README.txt")) 
 49  >>> oldbytes == newbytes 
 50  True 
 51   
 52   
 53  """ 
 54   
 55  __author__ = "Joe Bowers" 
 56  __license__ = "MIT License" 
 57  __version__ = "0.01" 
 58  __status__ = "Development" 
 59  __email__ = "joerbowers@gmail.com" 
 60  __url__ = "http://www.joe-bowers.com/static/lzw" 
 61   
 62  import struct 
 63  import itertools 
 64   
 65   
 66  CLEAR_CODE = 256 
 67  END_OF_INFO_CODE = 257 
 68   
 69  DEFAULT_MIN_BITS = 9 
 70  DEFAULT_MAX_BITS = 12 
 71   
 72   
 73   
 74   
def compress(plaintext_bytes):
    """
    Compress an iterable of bytes with LZW.

    Returns a (hopefully shorter) iterable of bytes that you can store
    in a file or pass over the network, and later feed to L{decompress}
    to recover the original bytes. This is the best place to start
    using this module.
    """
    return ByteEncoder().encodetobytes(plaintext_bytes)
84 85
def decompress(compressed_bytes):
    """
    Invert L{compress}: given an iterable of bytes that were produced
    by a call to L{compress}, return an iterator over the original,
    uncompressed bytes.
    """
    return ByteDecoder().decodefrombytes(compressed_bytes)
93 94 95 96 97
class ByteEncoder(object):
    """
    Compresses a stream of uncompressed bytes into a stream of
    compressed bytes readable by L{ByteDecoder}. Internally glues an
    L{Encoder} (bytes -> integer codepoints) to a L{BitPacker}
    (codepoints -> variable-width packed bytes).
    """

    def __init__(self, max_width=DEFAULT_MAX_BITS):
        """
        max_width: the widest codepoint, in bits, that may appear in
        the packed output stream; it bounds the codebook at
        2**max_width entries before a flush.
        """
        self._encoder = Encoder(max_code_size=2 ** max_width)
        self._packer = BitPacker(initial_code_size=self._encoder.code_size())

    def encodetobytes(self, bytesource):
        """
        Return an iterator of compressed bytes for the given iterable
        of plaintext bytes, growing the packed bit width as the
        codebook grows. Dual of L{ByteDecoder.decodefrombytes}.
        """
        return self._packer.pack(self._encoder.encode(bytesource))
141 142
class ByteDecoder(object):
    """
    Decompresses byte streams produced by L{ByteEncoder}: unpacks the
    variable-width bit stream with a L{BitUnpacker}, then expands the
    resulting codepoints with a L{Decoder}.
    """

    def __init__(self):
        self._decoder = Decoder()
        self._unpacker = BitUnpacker(initial_code_size=self._decoder.code_size())
        # Public attribute kept for interface compatibility; nothing in
        # this class reads or writes it after construction.
        self.remaining = []

    def decodefrombytes(self, bytesource):
        """
        Given an iterator over BitPacked, Encoded bytes, return an
        iterator over the uncompressed bytes. Dual of
        L{ByteEncoder.encodetobytes}.
        """
        return self._decoder.decode(self._unpacker.unpack(bytesource))
167 168
class BitPacker(object):
    """
    Translates a stream of lzw codepoints into a variable width packed
    stream of bytes, for use by L{BitUnpacker}. One of a (potential)
    set of encoders for a stream of LZW codepoints, intended to behave
    as closely to the TIFF variable-width encoding scheme as possible.

    The inbound stream of integer lzw codepoints are packed into
    variable width bit fields, starting at the smallest number of bits
    it can and then increasing the bit width as it anticipates the LZW
    code size growing to overflow.

    This class knows all kinds of intimate things about how its
    upstream codepoint processors work; it knows the control codes
    CLEAR_CODE and END_OF_INFO_CODE, and (more intimately still), it
    makes assumptions about the rate of growth of its consumer's
    codebook. This is ok, as long as the underlying encoder/decoders
    don't know any intimate details about their BitPackers/Unpackers
    """

    def __init__(self, initial_code_size):
        """
        Takes an initial code book size (that is, the count of known
        codes at the beginning of encoding, or after a clear)
        """
        # Drives both the starting bit width and the width reset that
        # happens when a CLEAR_CODE or END_OF_INFO_CODE goes by.
        self._initial_code_size = initial_code_size

    def pack(self, codepoints):
        """
        Given an iterator of integer codepoints, returns an iterator
        over bytes containing the codepoints packed into varying
        lengths, with bit width growing to accomodate an input code
        that it assumes will grow by one entry per codepoint seen.

        Widths will be reset to the given initial_code_size when the
        LZW CLEAR_CODE or END_OF_INFO_CODE code appears in the input,
        and bytes following END_OF_INFO_CODE will be aligned to the
        next byte boundary.

        >>> import lzw
        >>> pkr = lzw.BitPacker(258)
        >>> [ b for b in pkr.pack([ 1, 257]) ] == [ chr(0), chr(0xC0), chr(0x40) ]
        True
        """
        tailbits = []  # bits accumulated but not yet emitted as whole bytes
        codesize = self._initial_code_size

        # Smallest width (at least 8 bits) able to represent every code
        # currently in the book; BitUnpacker must mirror this exactly.
        minwidth = 8
        while (1 << minwidth) < codesize:
            minwidth = minwidth + 1

        nextwidth = minwidth

        for pt in codepoints:

            newbits = inttobits(pt, nextwidth)
            tailbits = tailbits + newbits

            # PAY ATTENTION. This calculation should be driven by the
            # size of the upstream codebook, right now we're just trusting
            # that everybody intends to follow the TIFF spec.
            codesize = codesize + 1

            # After END_OF_INFO_CODE, pad with zero bits out to the
            # next byte boundary.
            if pt == END_OF_INFO_CODE:
                while len(tailbits) % 8:
                    tailbits.append(0)

            if pt in [ CLEAR_CODE, END_OF_INFO_CODE ]:
                # The upstream codebook restarts here, so the width
                # and our size estimate restart too.
                nextwidth = minwidth
                codesize = self._initial_code_size
            elif codesize >= (2 ** nextwidth):
                # Widen one codepoint *before* the assumed codebook
                # size overflows the current width.
                nextwidth = nextwidth + 1

            # Emit full bytes; any trailing partial byte stays in
            # tailbits until more bits arrive (or the final flush).
            while len(tailbits) > 8:
                nextbits = tailbits[:8]
                nextbytes = bitstobytes(nextbits)
                for bt in nextbytes:
                    yield struct.pack("B", bt)

                tailbits = tailbits[8:]

        # Flush whatever is left, zero-padding the low bits of the
        # final byte (see bitstobytes).
        if tailbits:
            tail = bitstobytes(tailbits)
            for bt in tail:
                yield struct.pack("B", bt)
257 258 259 260
class BitUnpacker(object):
    """
    An adaptive-width bit unpacker, intended to decode streams written
    by L{BitPacker} into integer codepoints. Like L{BitPacker}, knows
    about code size changes and control codes.
    """

    def __init__(self, initial_code_size):
        """
        initial_code_size is the starting size of the codebook
        associated with the to-be-unpacked stream.
        """
        self._initial_code_size = initial_code_size

    def unpack(self, bytesource):
        """
        Given an iterator of bytes, returns an iterator of integer
        code points. Auto-magically adjusts point width when it sees
        an almost-overflow in the input stream, or an LZW CLEAR_CODE
        or END_OF_INFO_CODE

        Trailing bits at the end of the given iterator, after the last
        codepoint, will be dropped on the floor.

        At the end of the iteration, or when an END_OF_INFO_CODE seen
        the unpacker will ignore the bits after the code until it
        reaches the next aligned byte. END_OF_INFO_CODE will *not*
        stop the generator, just reset the alignment and the width


        >>> import lzw
        >>> unpk = lzw.BitUnpacker(initial_code_size=258)
        >>> [ i for i in unpk.unpack([ chr(0), chr(0xC0), chr(0x40) ]) ]
        [1, 257]
        """
        bits = []    # bits of the codepoint currently being assembled
        offset = 0   # position (0-7) within the current input byte
        ignore = 0   # padding bits still to skip after an END_OF_INFO_CODE

        codesize = self._initial_code_size
        # Smallest width (at least 8 bits) that holds codesize codes;
        # must mirror BitPacker's identical calculation.
        minwidth = 8
        while (1 << minwidth) < codesize:
            minwidth = minwidth + 1

        pointwidth = minwidth

        for nextbit in bytestobits(bytesource):

            offset = (offset + 1) % 8
            if ignore > 0:
                ignore = ignore - 1
                continue

            bits.append(nextbit)

            if len(bits) == pointwidth:
                codepoint = intfrombits(bits)
                bits = []

                yield codepoint

                # Assume the consumer's codebook grows by one entry per
                # codepoint, matching BitPacker's assumption.
                codesize = codesize + 1

                if codepoint in [ CLEAR_CODE, END_OF_INFO_CODE ]:
                    codesize = self._initial_code_size
                    pointwidth = minwidth
                else:
                    # is this too late?
                    while codesize >= (2 ** pointwidth):
                        pointwidth = pointwidth + 1

                if codepoint == END_OF_INFO_CODE:
                    # Skip the zero padding up to the next byte boundary.
                    ignore = (8 - offset) % 8
335 336 337
class Decoder(object):
    """
    Uncompresses a stream of lzw code points, as created by
    L{Encoder}. Given a list of integer code points, with all
    unpacking foolishness complete, turns that list of codepoints into
    a list of uncompressed bytes. See L{BitUnpacker} for what this
    doesn't do.
    """
    def __init__(self):
        """
        Creates a new Decoder. Decoders should not be reused for
        different streams.
        """
        self._clear_codes()
        # Public attribute kept for interface compatibility; nothing in
        # this class reads or writes it after construction.
        self.remainder = []

    def code_size(self):
        """
        Returns the current size of the Decoder's code book, that is,
        its mapping of codepoints to byte strings. The return value of
        this method will change as the decoder encounters more encoded
        input, or control codes.
        """
        return len(self._codepoints)

    def decode(self, codepoints):
        """
        Given an iterable of integer codepoints, yields the
        corresponding bytes, one at a time, as byte strings of length
        1. Retains the state of the codebook from call to call, so
        if you have another stream, you'll likely need another
        decoder!

        Decoders will NOT handle END_OF_INFO_CODE (rather, they will
        handle the code by throwing an exception); END_OF_INFO should
        be handled by the upstream codepoint generator (see
        L{BitUnpacker}, for example)

        >>> dec = Decoder()
        >>> b"".join(dec.decode([103, 97, 98, 98, 97, 32, 258, 260, 262, 121, 111, 263, 259, 261, 256]))
        b'gabba gabba yo gabba'
        """
        codepoints = [ cp for cp in codepoints ]

        for cp in codepoints:
            decoded = self._decode_codepoint(cp)
            # Yield length-1 byte strings. Slicing (rather than
            # iterating/indexing) is identical on Python 2 strings and
            # keeps this correct on Python 3, where bytes[i] is an int.
            for i in range(len(decoded)):
                yield decoded[i:i + 1]

    def _decode_codepoint(self, codepoint):
        """
        Decode a single codepoint and grow the codebook as a side
        effect; returns the decoded byte string (b"" for CLEAR_CODE).

        Will raise a ValueError if given an END_OF_INFORMATION
        code. EOI codes should be handled by callers if they're
        present in our source stream.
        """

        ret = b""

        if codepoint == CLEAR_CODE:
            self._clear_codes()
        elif codepoint == END_OF_INFO_CODE:
            raise ValueError("End of information code not supported directly by this Decoder")
        else:
            if codepoint in self._codepoints:
                ret = self._codepoints[ codepoint ]
                if self._prefix is not None:
                    # New entry: previous string plus the first byte of
                    # this one. Slice, not index, so the result is a
                    # byte string on both Python 2 and 3.
                    self._codepoints[ len(self._codepoints) ] = self._prefix + ret[0:1]
            else:
                # Codepoint not yet in the book: the "cScSc" special
                # case -- the prefix plus its own first byte.
                ret = self._prefix + self._prefix[0:1]
                self._codepoints[ len(self._codepoints) ] = ret

            self._prefix = ret

        return ret

    def _clear_codes(self):
        # Rebuild the initial codebook: codes 0-255 map to their own
        # single-byte values. CLEAR_CODE and END_OF_INFO_CODE are
        # stored as bare integers (a teensy hack: an int never equals a
        # byte string) purely so code_size() counts them.
        self._codepoints = dict( (pt, struct.pack("B", pt)) for pt in range(256) )
        self._codepoints[CLEAR_CODE] = CLEAR_CODE
        self._codepoints[END_OF_INFO_CODE] = END_OF_INFO_CODE
        self._prefix = None
class Encoder(object):
    """
    Given an iterator of bytes, returns an iterator of integer
    codepoints, suitable for use by L{Decoder}. The core of the
    "compression" side of lzw compression/decompression.
    """
    def __init__(self, max_code_size=(2**DEFAULT_MAX_BITS)):
        """
        When the encoding codebook grows larger than max_code_size,
        the Encoder will clear its codebook and emit a CLEAR_CODE

        Raises ValueError if max_code_size cannot hold even the
        initial codebook (256 literals plus the two control codes).
        """

        # Public flag kept for interface compatibility; nothing in this
        # class mutates it after construction.
        self.closed = False

        self._max_code_size = max_code_size
        # b'' == '' on Python 2, so this is the same empty-prefix
        # sentinel as before; on Python 3 it keeps concatenation with
        # the byte-string keys of self._prefixes legal.
        self._buffer = b''
        self._clear_codes()

        if max_code_size < self.code_size():
            raise ValueError("Max code size too small, (must be at least {0})".format(self.code_size()))

    def code_size(self):
        """
        Returns a count of the known codes, including codes that are
        implicit in the data but have not yet been produced by the
        iterator.
        """
        return len(self._prefixes)

    def flush(self):
        """
        Yields any buffered codepoints, followed by a CLEAR_CODE, and
        clears the codebook as a side effect.
        """
        if self._buffer:
            yield self._prefixes[ self._buffer ]
            self._buffer = b''

        yield CLEAR_CODE
        self._clear_codes()

    def encode(self, bytesource):
        """
        Given an iterator over bytes, yields the
        corresponding stream of codepoints.
        Will clear the codes at the end of the stream.

        >>> enc = Encoder()
        >>> [ cp for cp in enc.encode(b"gabba gabba yo gabba") ]
        [103, 97, 98, 98, 97, 32, 258, 260, 262, 121, 111, 263, 259, 261, 256]
        """
        for b in bytesource:
            for point in self._encode_byte(b):
                yield point

            # Codebook full: emit what we have plus a CLEAR_CODE and
            # start the book over.
            if self.code_size() >= self._max_code_size:
                for pt in self.flush():
                    yield pt

        for point in self.flush():
            yield point

    def _encode_byte(self, byte):
        # Yields one or zero codepoints, AND changes the internal state
        # of the codebook and prefix buffer.
        #
        # Unless you're in self.encode(), you almost certainly don't
        # want to call this.

        # Iterating bytes on Python 3 yields ints; normalize to a
        # one-byte string so lookups match the byte-string keys of
        # self._prefixes. No-op on Python 2, where iterating a str
        # yields length-1 strs.
        if isinstance(byte, int):
            byte = struct.pack("B", byte)

        new_prefix = self._buffer

        if new_prefix + byte in self._prefixes:
            new_prefix = new_prefix + byte
        elif new_prefix:
            encoded = self._prefixes[ new_prefix ]
            self._add_code(new_prefix + byte)
            new_prefix = byte

            yield encoded

        self._buffer = new_prefix

    def _clear_codes(self):

        # Teensy hack, CLEAR_CODE and END_OF_INFO_CODE aren't
        # equal to any possible string (they're stored as bare ints),
        # so they pad out code_size() without ever matching data.

        self._prefixes = dict( (struct.pack("B", codept), codept) for codept in range(256) )
        self._prefixes[ CLEAR_CODE ] = CLEAR_CODE
        self._prefixes[ END_OF_INFO_CODE ] = END_OF_INFO_CODE

    def _add_code(self, newstring):
        # The next free codepoint is simply the current table size.
        self._prefixes[ newstring ] = len(self._prefixes)
550 551 552
class PagingEncoder(object):
    """
    UNTESTED. Handles encoding of multiple chunks or streams of encodable data,
    separated with control codes. Dual of PagingDecoder.
    """
    def __init__(self, initial_code_size, max_code_size):
        # Both sizes are forwarded to the per-page Encoder/BitPacker pair.
        self._initial_code_size = initial_code_size
        self._max_code_size = max_code_size

    def encodepages(self, pages):
        """
        Given an iterator of iterators of bytes, produces a single
        iterator containing a delimited sequence of independently
        compressed LZW sequences. Each page begins on a byte-aligned
        spot with a CLEAR code and is terminated with an
        END_OF_INFORMATION code (and zero to seven trailing junk
        bits).

        The dual of PagingDecoder.decodepages
        """
        for page in pages:

            page_encoder = Encoder(max_code_size=self._max_code_size)
            framed = itertools.chain(
                [ CLEAR_CODE ],
                page_encoder.encode(page),
                [ END_OF_INFO_CODE ])

            packer = BitPacker(initial_code_size=page_encoder.code_size())

            for packed_byte in packer.pack(framed):
                yield packed_byte
596 597 598 599
class PagingDecoder(object):
    """
    UNTESTED. Dual of PagingEncoder, knows how to handle independently
    encoded, END_OF_INFO_CODE delimited chunks of an inbound byte stream
    """

    def __init__(self, initial_code_size):
        self._initial_code_size = initial_code_size
        self._remains = []

    def next_page(self, codepoints):
        """
        Iterator over the next page of codepoints. Stops at
        END_OF_INFO_CODE, stashing the remainder of the stream in
        self._remains, or at the end of the input.
        """
        self._remains = []

        try:
            while 1:
                # next(it) instead of it.next(): works on Python 2.6+
                # and Python 3 iterators alike.
                cp = next(codepoints)
                if cp != END_OF_INFO_CODE:
                    yield cp
                else:
                    self._remains = codepoints
                    break

        except StopIteration:
            pass

    def decodepages(self, bytesource):
        """
        Takes an iterator of bytes, returns an iterator of iterators
        of uncompressed data. Expects input to conform to the output
        conventions of PagingEncoder(), in particular that "pages" are
        separated with an END_OF_INFO_CODE and padding up to the next
        byte boundary.

        BUG: Dangling trailing page on decompression.
        """

        # TODO: WE NEED A CODE SIZE POLICY OBJECT THAT ISN'T THIS.
        # honestly, we should have a "codebook" object we need to pass
        # to bit packing/unpacking tools, etc, such that we don't have
        # to roll all of these code size assumptions everyplace.

        unpacker = BitUnpacker(initial_code_size=self._initial_code_size)
        codepoints = unpacker.unpack(bytesource)

        self._remains = codepoints
        while self._remains:
            nextpoints = self.next_page(self._remains)
            nextpoints = [ nx for nx in nextpoints ]

            decoder = Decoder()
            decoded = decoder.decode(nextpoints)
            decoded = [ dec for dec in decoded ]

            yield decoded
674 675 676 677 ######################################### 678 # Conveniences. 679 680 681 # PYTHON V2
def unpackbyte(b):
    """
    Given a one-byte byte string, returns it as an integer; equivalent
    to struct.unpack("B", b). Integers are passed through unchanged,
    so this also accepts the ints produced by iterating a Python 3
    bytes object (superseding the old commented-out "PYTHON V3"
    variant).
    """
    if isinstance(b, int):
        # Python 3: iterating bytes already yields ints.
        return b
    (ret,) = struct.unpack("B", b)
    return ret
def filebytes(fileobj, buffersize=1024):
    """
    Iterate over the bytes of a file-like object (anything with a
    read(int) method), reading at most buffersize bytes at a time.
    """
    while True:
        chunk = fileobj.read(buffersize)
        if not chunk:
            return
        for byte in chunk:
            yield byte
705 706
def readbytes(filename, buffersize=1024):
    """
    Opens the file named by filename and iterates over the
    L{filebytes} found therein. Closes the file when the bytes
    run out.
    """
    stream = open(filename, "rb")
    try:
        for byte in filebytes(stream, buffersize):
            yield byte
    finally:
        stream.close()
715 716 717
def writebytes(filename, bytesource):
    """
    Convenience for dumping generated bytes to disk: opens (and
    truncates) the file named by filename, writes every item of
    bytesource into it, and closes it.
    """
    with open(filename, "wb") as sink:
        for chunk in bytesource:
            sink.write(chunk)
728 729
def inttobits(anint, width=None):
    """
    Represent a non-negative integer as a list of bits, MSB first.
    If width is given, the MSB end is zero-padded out to width; an
    overflowing value is NOT truncated.

    >>> inttobits(304, width=16)
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]
    """
    bits = []
    value = anint
    while value:
        bits.insert(0, value & 1)
        value = value >> 1

    if width is not None:
        # A negative pad length yields [], i.e. no truncation.
        bits = ([ 0 ] * (width - len(bits))) + bits

    return bits
756 757
def intfrombits(bits):
    """
    Interpret a sequence of boolean values as an MSB-first unsigned
    binary number (True == 1, False == 0) and return it.

    >>> intfrombits([ 1, 0, 0, 1, 1, 0, 0, 0, 0 ])
    304
    """
    value = 0
    for bit in bits:
        value = (value << 1) | (1 if bit else 0)
    return value
777 778
def bytestobits(bytesource):
    """
    Expand an iterable of bytes into an iterable of single bits (0/1
    integers), MSB first within each byte.

    >>> [ x for x in bytestobits(b"\\x01\\x30") ]
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]
    """
    for b in bytesource:
        value = unpackbyte(b)
        for shift in range(7, -1, -1):
            yield (value >> shift) & 1
796 797
def bitstobytes(bits):
    """
    Pack an indexable sequence of truthy/falsy "bits", MSB first, into
    a list of integers from 0 to 255. A trailing partial byte has its
    low-order bits zero-padded, so a round trip through
    bytestobits(bitstobytes(x)) may surprise you when len(x) % 8 != 0.

    Does *NOT* pack the returned values into a bytearray or the like.

    >>> bitstobytes([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]) == [ 0x01, 0x30 ]
    True
    """
    out = []
    accum = 0
    count = 0
    for bit in bits:
        accum = (accum << 1) | (1 if bit else 0)
        count = count + 1
        if count == 8:
            out.append(accum)
            accum = 0
            count = 0

    if count:
        # Zero-pad the final partial byte's low-order bits.
        out.append(accum << (8 - count))

    return out
830