Package checkm :: Module checkm
[hide private]
[frames] | [no frames]

Source Code for Module checkm.checkm

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """Checksumming convenience classes 
  5   
  6  TODO! Sorry! 
  7   
  8                  [@]SourceFileOrURL  Alg     Digest  Length   ModTime   TargetFileOrURL 
  9  TOKEN NUMBER:    1                  2       3       4        5         6 
 10   
 11  """ 
 12   
 13  from __future__ import with_statement 
 14   
 15  COLUMNS = { 0:"SourceFileOrURL", 
 16              1:"Alg", 
 17              2:"Digest", 
 18              3:"Length", 
 19              4:"ModTime", 
 20              5:"TargetFileOrURL", 
 21              } 
 22   
 23   
 24  import os, sys 
 25  from stat import * 
 26   
 27  import re 
 28   
 29  from collections import defaultdict 
 30   
 31  import hashlib 
 32   
 33  import codecs 
 34   
 35  import logging 
 36   
 37  logging.basicConfig(level=logging.INFO) 
 38   
 39  logger = logging.getLogger('checkm') 
 40   
41 -class NotFound(Exception):
42 """The item or directory was either not found, or not accessible."""
43 - def __init__(self, *arg, **kw):
44 """ 45 FIXME 46 @param *arg: 47 @type *arg: 48 @param **kw: 49 @type **kw: 50 """ 51 self.context = (arg, kw)
52 - def __repr__(self):
53 """ 54 FIXME 55 """ 56 return self.context.__str__()
57 - def __str__(self):
58 """ 59 FIXME 60 """ 61 return self.context.__str__()
62
63 -class CheckmReporter(object):
64 COLUMN_NAMES = [u'# [@]SourceFileOrURL',u'Alg',u'Digest',u'Length',u'ModTime']
65 - def __init__(self):
66 """ 67 FIXME 68 """ 69 self.scanner = CheckmScanner()
70
71 - def _get_max_len(self, report):
72 """ 73 FIXME 74 @param report: 75 @type report: 76 """ 77 cols = defaultdict(lambda : 0) 78 for line in report: 79 for index in xrange(len(line)): 80 if len(line[index])>cols[index]: 81 cols[index] = len(line[index]) 82 return cols
83
84 - def _space_line(self, line, col_maxes):
85 """ 86 FIXME 87 @param line: 88 @type line: 89 @param col_maxes: 90 @type col_maxes: 91 """ 92 spaced_line = [] 93 for index in xrange(len(line)): 94 spaced_line.append(line[index]) 95 spaces = col_maxes[index]-len(line[index])+4 96 spaced_line.append(u" "*spaces) 97 return u"".join(spaced_line)
98
99 - def create_bagit_manifest(self, scan_directory, algorithm, recursive=False, delimiter = " ", filename=None):
100 """ 101 FIXME 102 @param scan_directory: 103 @type scan_directory: 104 @param algorithm: 105 @type algorithm: 106 @param recursive=False: 107 @type recursive=False: 108 @param delimiter: 109 @type delimiter: 110 @param filename=None: 111 @type filename=None: 112 """ 113 if not filename: 114 filename = "manifest-%s.txt" % algorithm 115 logger.info("Creating bagit manifest file(%s) for dir(%s) with Alg:%s" % (filename, 116 scan_directory, 117 algorithm)) 118 report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=3) 119 if hasattr(filename, 'write'): 120 faked_filename = "manifest-%s.txt" % algorithm 121 for line in report: 122 if line[2] != "d": 123 if os.path.abspath(line[0]) != os.path.abspath(faked_filename): 124 filename.write("%s%s%s\n" % (line[2], delimiter, line[0])) 125 else: 126 logger.info("Manifest file match - scan line ignored") 127 else: 128 with codecs.open(filename, encoding='utf-8', mode="w") as output: 129 for line in report: 130 if line[2] != "d": 131 if os.path.abspath(line[0]) != os.path.abspath(filename): 132 output.write("%s%s%s\n" % (line[2], delimiter, line[0])) 133 else: 134 logger.info("Manifest file match - scan line ignored") 135 output.write("\n") 136 return filename
137
138 - def create_multilevel_checkm(self, top_directory, algorithm, checkm_filename, columns=3):
139 logger.info("Creating multilevel checkm files '(%s)' from top level directory(%s) with Alg:%s and columns:%s" % (checkm_filename, top_directory, algorithm, columns)) 140 if not os.path.isdir(top_directory): 141 raise NotFound(top_directory=top_directory) 142 # Gather list of directories to scan 143 # And their subdirectories 144 # bottom up! 145 dirs = dict([(root, dirnames) for (root, dirnames, _) in os.walk(top_directory, topdown=False)]) 146 # per directory 147 for dirname in dirs: 148 with codecs.open(os.path.join(dirname, checkm_filename), encoding='utf-8', mode="w") as output: 149 self.create_checkm_file(dirname, 150 algorithm, 151 os.path.join(dirname, checkm_filename), 152 recursive=False, 153 columns=columns, 154 checkm_file=output) 155 subdir_report = [] 156 for subdir in dirs[dirname]: 157 try: 158 line = self.scanner.scan_path(os.path.join(subdir, checkm_filename), algorithm, columns) 159 line[0] = '@%s' % (line[0]) 160 subdir_report.append(line) 161 except Exception, e: 162 print "Fail! %s" % e 163 col_maxes = self._get_max_len(subdir_report) 164 for line in subdir_report: 165 output.write('%s\n' % (self._space_line(line, col_maxes))) 166 output.write('\n')
167
168 - def create_checkm_file(self, scan_directory, algorithm, checkm_filename, recursive=False, columns=3, checkm_file=None):
169 logger.info("Creating checkm file for dir(%s) with Alg:%s and columns: %s" % ( 170 scan_directory, 171 algorithm, columns)) 172 report = self.scanner.scan_directory(scan_directory, algorithm, recursive=recursive, columns=columns) 173 col_maxes = self._get_max_len(report) 174 if checkm_file != None and hasattr(checkm_file, 'write'): 175 checkm_file.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes))) 176 for line in report: 177 if os.path.abspath(line[0]) != os.path.abspath(checkm_filename): 178 checkm_file.write("%s\n" % (self._space_line(line, col_maxes))) 179 else: 180 logger.info("Manifest file match - scan line ignored") 181 return checkm_file 182 else: 183 with codecs.open(checkm_filename, encoding='utf-8', mode="w") as output: 184 output.write("%s \n" % (self._space_line(CheckmReporter.COLUMN_NAMES[:columns], col_maxes))) 185 for line in report: 186 if os.path.abspath(line[0]) != os.path.abspath(checkm_filename): 187 output.write("%s\n" % (self._space_line(line, col_maxes))) 188 else: 189 logger.info("Manifest file match - scan line ignored") 190 output.write("\n")
191
192 - def check_bagit_hashes(self, bagit_filename, algorithm=None):
193 """ 194 FIXME 195 @param bagit_filename: 196 @type bagit_filename: 197 @param algorithm=None: 198 @type algorithm=None: 199 """ 200 logger.info("Checking files against '%s' bagit manifest" % bagit_filename) 201 if algorithm == None: 202 if hasattr(bagit_filename, 'read'): 203 raise Exception("Need to supply the algorithm when passing a filelike object instead of a filename") 204 m = re.search("manifest-(?P<alg>[^\.]+)\.txt", bagit_filename) 205 if m != None: 206 algorithm = m.groupdict()['alg'] 207 parser = BagitParser(bagit_filename) 208 scanner = CheckmScanner() 209 results = {'pass':[], 'fail':{}} 210 for row in parser: 211 if row: 212 try: 213 scan_row = scanner.scan_path(row[1], algorithm, 3) 214 if row[0] != scan_row[2]: 215 logger.info("Failed original: %s" % row) 216 logger.info("Current scan: %s" % scan_row) 217 results['fail'][row[1]] = (row, scan_row) 218 else: 219 results['pass'].append(row[1]) 220 except NotFound: 221 scan_row = "File not found" 222 logger.info("Failed original: %s" % row) 223 logger.info("But file not found at this path.") 224 results['fail'][row[1]] = (row, scan_row) 225 return results
226
227 - def check_checkm_hashes(self, scan_directory, checkm_filename, ignore_multilevel=True):
228 """ 229 FIXME 230 @param scan_directory: 231 @type scan_directory: 232 @param checkm_filename: 233 @type checkm_filename: 234 """ 235 def _check_files_against_parser(parser): 236 scanner = CheckmScanner() 237 results = {'pass':[], 'fail':{}, 'include':[]} 238 for row in parser: 239 if row: 240 try: 241 if row[0].startswith('@'): 242 row[0] = row[0][1:] 243 results['include'].append(row[0]) 244 scan_row = scanner.scan_path(row[0], row[1], len(row)) 245 nomatch = False 246 for expected, scanned in zip(row, scan_row): 247 if expected != "-" and expected != scanned: 248 nomatch = True 249 if nomatch: 250 logger.info("Failed original: %s" % row) 251 logger.info("Current scan: %s" % scan_row) 252 results['fail'][row[0]] = (row, scan_row) 253 else: 254 results['pass'].append(row[0]) 255 except NotFound: 256 scan_row = "File not found" 257 logger.info("Failed original: %s" % row) 258 logger.info("But file not found at this path.") 259 results['fail'][row[0]] = (row, scan_row) 260 return results
261 262 logger.info("Checking files against %s checkm manifest" % checkm_filename) 263 parser = CheckmParser(checkm_filename) 264 results = _check_files_against_parser(parser) 265 if ignore_multilevel: 266 return results 267 else: 268 # shallow copy of the include list, as we will be pop'ing off items 269 checkm_list = results['include'][:] 270 while checkm_list: 271 checkm_file = checkm_list.pop() 272 parser = CheckmParser(checkm_file) 273 additional_results = _check_files_against_parser(parser) 274 # Add to the passes 275 results['pass'].extend(additional_results['pass']) 276 # add to the overall list of 277 results['include'].extend(additional_results['include']) 278 checkm_list.extend(additional_results['include']) 279 # add to the fail dict 280 results['fail'].update(additional_results['fail']) 281 return results
282
283 -class BagitParser(object):
284 - def __init__(self, bagit_file=None):
285 """ 286 FIXME 287 @param bagit_file=None: 288 @type bagit_file=None: 289 """ 290 self.status = False 291 self.lines = [] 292 if bagit_file: 293 self.parse(bagit_file)
294
295 - def __iter__(self):
296 """ 297 FIXME 298 """ 299 class Bagit_iter: 300 def __init__(self, lines): 301 """ 302 FIXME 303 @param lines: 304 @type lines: 305 """ 306 self.lines = lines 307 self.last = 0
308 def __iter__(self): 309 """ 310 FIXME 311 """ 312 return self
313 def next(self): 314 """ 315 FIXME 316 """ 317 if self.last >= len(self.lines): # threshhold terminator 318 raise StopIteration 319 elif len(self.lines) == 0: 320 raise StopIteration 321 else: 322 self.last += 1 323 return self.lines[self.last-1] 324 return Bagit_iter(self.lines) 325
326 - def parse(self, fileobj):
327 """ 328 FIXME 329 @param fileobj: 330 @type fileobj: 331 """ 332 if not hasattr(fileobj, "read"): 333 with codecs.open(fileobj, encoding='utf-8', mode="r") as check_fh: 334 self._parse_lines(check_fh) 335 else: 336 self._parse_lines(fileobj) 337 return self.lines
338
339 - def _parse_lines(self, fh):
340 """ 341 FIXME 342 @param fh: 343 @type fh: 344 """ 345 self.lines = [] # clear the deck 346 line_buffer = "" 347 def _parse_line(line): 348 """ 349 FIXME 350 @param line: 351 @type line: 352 """ 353 if not line.startswith('#'): 354 tokens = filter(lambda x: x, re.split("\s+", line, 1)) # 2 columns 355 logger.info(tokens) 356 if tokens: 357 # handle "\s*\*" situation 358 if tokens[1].startswith("*"): 359 tokens[1] = tokens[1][1:].strip() 360 self.lines.append(tokens)
361 for chunk in fh.read(0x1000): 362 line_buffer = line_buffer + chunk 363 while True: 364 if not line_buffer: 365 break 366 fragments = line_buffer.split('\n',1) 367 if len(fragments) == 1: 368 break 369 _parse_line(fragments[0]) 370 line_buffer = fragments[1] 371
372 -class CheckmParser(object):
373 - def __init__(self, checkm_file=None):
374 """ 375 FIXME 376 @param checkm_file=None: 377 @type checkm_file=None: 378 """ 379 self.status = False 380 self.lines = [] 381 if checkm_file: 382 self.parse(checkm_file)
383
384 - def __iter__(self):
385 """ 386 FIXME 387 """ 388 class Checkm_iter: 389 def __init__(self, lines): 390 """ 391 FIXME 392 @param lines: 393 @type lines: 394 """ 395 self.lines = lines 396 self.last = 0
397 def __iter__(self): 398 """ 399 FIXME 400 """ 401 return self
402 def next(self): 403 """ 404 FIXME 405 """ 406 if self.last >= len(self.lines): # threshhold terminator 407 raise StopIteration 408 elif len(self.lines) == 0: 409 raise StopIteration 410 else: 411 self.last += 1 412 return self.lines[self.last-1] 413 return Checkm_iter(self.lines) 414
415 - def parse(self, checkm_file):
416 """ 417 FIXME 418 @param checkm_file: 419 @type checkm_file: 420 """ 421 if not hasattr(checkm_file, "read"): 422 if os.path.isfile(checkm_file): 423 with codecs.open(checkm_file, encoding='utf-8', mode="r") as check_fh: 424 self._parse_lines(check_fh) 425 else: 426 raise NotFound(checkm_file=checkm_file) 427 else: 428 self._parse_lines(checkm_file) 429 return self.lines
430
431 - def _parse_lines(self, fh):
432 """ 433 FIXME 434 @param fh: 435 @type fh: 436 """ 437 self.lines = [] # clear the deck 438 line_buffer = "" 439 def _parse_line(line): 440 """ 441 FIXME 442 @param line: 443 @type line: 444 """ 445 if not line.startswith('#'): 446 tokens = filter(lambda x: x, re.split("\s+", line, 5)) # 6 column max defn == 5 splits 447 logger.info(tokens) 448 if tokens: 449 self.lines.append(tokens)
450 451 for chunk in fh.read(0x1000): 452 line_buffer = line_buffer + chunk 453 while True: 454 if not line_buffer: 455 break 456 fragments = line_buffer.split('\n',1) 457 if len(fragments) == 1: 458 break 459 _parse_line(fragments[0]) 460 line_buffer = fragments[1] 461
462 -class CheckmScanner(object):
463 HASHTYPES = ['md5', 'sha1', 'sha224','sha256','sha384','sha512']
464 - def scan_local(self, directory_path, algorithm, columns=3):
465 """ 466 FIXME 467 @param directory_path: 468 @type directory_path: 469 @param algorithm: 470 @type algorithm: 471 @param columns=3: 472 @type columns=3: 473 """ 474 report = [] 475 for item in os.listdir(directory_path): 476 item_path = os.path.join(directory_path, item) 477 report.append(self.scan_path(item_path, algorithm, columns)) 478 return report
479
480 - def scan_tree(self, directory_path, algorithm, columns):
481 """ 482 FIXME 483 @param directory_path: 484 @type directory_path: 485 @param algorithm: 486 @type algorithm: 487 @param columns: 488 @type columns: 489 """ 490 report = [] 491 if os.path.exists(directory_path): 492 for (dirpath, dirnames, filenames) in os.walk(directory_path): 493 for item_path in [os.path.join(dirpath, x) for x in dirnames+filenames]: 494 report.append(self.scan_path(item_path, algorithm, columns)) 495 return report 496 else: 497 raise NotFound(directory_path=directory_path, recursive=True)
498
499 - def scan_path(self, item_path, algorithm, columns):
500 """ 501 FIXME 502 @param item_path: 503 @type item_path: 504 @param algorithm: 505 @type algorithm: 506 @param columns: 507 @type columns: 508 """ 509 if columns<3 or not isinstance(columns, int): 510 columns = 3 511 try: 512 line = [] 513 # col 1 514 line.append(unicode(item_path)) 515 # col 2 516 line.append(unicode(algorithm)) 517 # col 3 518 if os.path.isdir(item_path): 519 line.append(u'd') 520 else: 521 # No need to catch the ValueError from 522 hash_gen = getattr(hashlib, algorithm)() 523 with open(item_path, 'rb') as fh: 524 logger.info("Checking %s with algorithm %s" % (item_path, algorithm)) 525 chunk = fh.read(1024*8) 526 while chunk: 527 hash_gen.update(chunk) 528 chunk= fh.read(1024*8) 529 line.append(unicode(hash_gen.hexdigest())) 530 if columns>3: 531 # col4 - Length 532 line.append(unicode(os.stat(item_path)[ST_SIZE])) 533 if columns>4: 534 # col 5 - ModTime 535 line.append(unicode(os.stat(item_path)[ST_MTIME])) 536 return line 537 except OSError: 538 raise NotFound(item_path=item_path) 539 except IOError: 540 raise NotFound(item_path=item_path) 541 except AttributeError: 542 raise ValueError("This tool cannot perform hashtype %s" % algorithm)
543
544 - def scan_directory(self, directory_path, algorithm, recursive=False, columns=3):
545 """ 546 FIXME 547 @param directory_path: 548 @type directory_path: 549 @param algorithm: 550 @type algorithm: 551 @param recursive=False: 552 @type recursive=False: 553 @param columns=3: 554 @type columns=3: 555 """ 556 if os.path.exists(directory_path): 557 if recursive: 558 return self.scan_tree(directory_path, algorithm, columns) 559 return self.scan_local(directory_path, algorithm, columns) 560 else: 561 raise NotFound(directory_path=directory_path, recursive=recursive)
562