pairtree.pairtree

41 """A client that oversees the implementation of the Pairtree FS specification 42 version 0.1. 43 44 >>> from pairtree import PairtreeStorageClient 45 >>> store = PairtreeStorageClient(store_dir='data', uri_base="http://") 46 47 This will create the following on disc in a directory called 'data' if it doesn't already exist:: 48 49 $ ls -R data/ 50 data/: 51 pairtree_prefix pairtree_root pairtree_version0_1 52 53 data/pairtree_root: 54 55 Where 56 1. the file 'pairtree_prefix' contains just "http://" 57 2. the file 'pairtree_version0_1' contains:: 58 59 This directory conforms to Pairtree Version 0.1. 60 Updated spec: http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html 61 62 Note, if data *had* already existed and was a pairtree store, the uri_base would 63 have been read from the prefix file and override the one supplied above. 64 65 Also, if you try to create a store over a directory that already exists, but which isn't 66 a pairtree store that it can recognise, it will raise a L{NotAPairtreeStoreException}. 67 """

def __init__(self, uri_base, store_dir, shorty_length=2, hashing_type=None):
    """
    Record the store settings on the client and then initialise (or open)
    the store on disc through L{_init_store}.

    @param store_dir: The file directory where the pairtree store is
    @type store_dir: A path to a directory, relative or absolute
    @param uri_base: The URI base for the store
    @type uri_base: A URI fragment, like "http://example.org/"
    @param shorty_length: The size of the shorties in the pairtree implementation (Default: 2)
    @type shorty_length: integer
    @param hashing_type: The name of the algorithm to use when hashing files,
                         if left as None, this is disabled.
    @type hashing_type: Any supported by C{hashlib}
    """
    # Identifier regexes: characters needing ^hh escaping, and the ^hh
    # escape sequences themselves.
    self._decode = re.compile(r"\^(..)", re.U)
    self._encode = re.compile(r"[\"*+,<=>?\\^|]|[^\x21-\x7e]", re.U)

    self.hashing_type = hashing_type
    self.shorty_length = shorty_length

    # Any falsy uri_base (None, "") is stored as None.
    self.uri_base = uri_base or None

    self.store_dir = store_dir
    self.pairtree_root = os.path.join(store_dir, 'pairtree_root')

    # Must run last: it reads store_dir, pairtree_root and uri_base.
    self._init_store()

92

def __char2hex(self, m):
    """
    Internal - hand C{m} to C{ppath.char2hex} and return its result.

    @param m: Value forwarded unchanged to C{ppath.char2hex}
              (presumably a regex match object - confirm against C{ppath})
    @returns: Whatever C{ppath.char2hex} returns
    """
    hex_form = ppath.char2hex(m)
    return hex_form

95

def __hex2char(self, m):
    """
    Internal - hand C{m} to C{ppath.hex2char} and return its result.

    @param m: Value forwarded unchanged to C{ppath.hex2char}
              (presumably a regex match object - confirm against C{ppath})
    @returns: Whatever C{ppath.hex2char} returns
    """
    char_form = ppath.hex2char(m)
    return char_form

98

def id_encode(self, id):
    """
    Encode an identifier for use in a pairtree path. The work is delegated
    to C{ppath.id_encode}; the rules below are quoted from section 3 of the
    Pairtree specification.

    Step one cleans the identifier of characters that are expected to occur
    rarely in object identifiers but that would cause certain known problems
    for file systems. Every UTF-8 octet outside the range of visible ASCII
    (94 characters with hexadecimal codes 21-7e) [ASCII] (Cerf, "ASCII format
    for network interchange," October 1969.), as well as the following
    visible ASCII characters::

        "   hex 22           <   hex 3c           ?   hex 3f
        *   hex 2a           =   hex 3d           ^   hex 5e
        +   hex 2b           >   hex 3e           |   hex 7c
        ,   hex 2c

    must be converted to its 3-character hexadecimal encoding, ^hh, where ^
    is a circumflex and hh is two hex digits. For example, ' ' (space) is
    converted to ^20 and '*' to ^2a.

    Step two applies these single-character to single-character
    conversions::

        / -> =
        : -> +
        . -> ,

    These characters occur quite commonly in opaque identifiers but present
    special problems for filesystems. Mapping them one-to-one avoids hex
    encoding them (and so expanding each to three characters), which keeps
    the typical ppath reasonably short. Examples of identifier strings after
    cleaning and after ppath mapping::

        id:  ark:/13030/xt12t3
            ->  ark+=13030=xt12t3
            ->  ar/k+/=1/30/30/=x/t1/2t/3/
        id:  http://n2t.info/urn:nbn:se:kb:repos-1
            ->  http+==n2t,info=urn+nbn+se+kb+repos-1
            ->  ht/tp/+=/=n/2t/,i/nf/o=/ur/n+/n/bn/+s/e+/kb/+/re/p/os/-1/
        id:  what-the-*@?#!^!?
            ->  what-the-^2a@^3f#!^5e!^3f
            ->  wh/at/-t/he/-^/2a/@^/3f/#!/^5/e!/^3/f/

    @param id: Encode the given identifier according to the pairtree 0.1 specification
    @type id: identifier
    @returns: A string of the encoded identifier
    """
    encoded = ppath.id_encode(id)
    return encoded

146

def id_decode(self, id):
    """
    Turn an identifier in its pairtree filesystem encoding back into its
    original form. The work is delegated to C{ppath.id_decode}.

    @param id: Identifier to decode
    @type id: identifier
    @returns: A string of the decoded identifier
    """
    decoded = ppath.id_decode(id)
    return decoded

156

def _get_id_from_dirpath(self, dirpath):
    """
    Internal - discover the pairtree identifier for a given directory path.
    Delegates to C{ppath.get_id_from_dirpath}, passing this store's
    C{pairtree_root}.

    E.g. pairtree_root/fo/ob/ar/+/ --> 'foobar:'

    @param dirpath: Directory path to decode
    @type dirpath: Path to object's root
    @returns: Decoded identifier
    """
    root = self.pairtree_root
    return ppath.get_id_from_dirpath(dirpath, root)

171

def _get_path_from_dirpath(self, dirpath):
    """
    Internal - build the list of directory shorties for C{dirpath},
    relative to the pairtree_root. Delegates to
    C{ppath.get_path_from_dirpath}, passing this store's C{pairtree_root}.

    @param dirpath: Directory path to walk
    @type dirpath: Directory path
    @returns: Whatever C{ppath.get_path_from_dirpath} returns
              (presumably a list of shorties - confirm against C{ppath})
    """
    root = self.pairtree_root
    return ppath.get_path_from_dirpath(dirpath, root)

188 189

def _id_to_dirpath(self, id):
    """
    Internal - turn an identifier into a pairtree directory tree of
    shorties. Delegates to C{ppath.id_to_dirpath}, passing this store's
    C{pairtree_root} and C{shorty_length}.

    With the character mapping described in L{id_encode} and a shorty
    length of 2, the shorties look like:

        - I{"foobar://ark.1" --> "fo/ob/ar/+=/=a/rk/,1"}

    (presumably prefixed with the pairtree root - confirm against C{ppath}).

    @param id: Identifier for a pairtree object
    @type id: identifier
    @returns: A directory path to the object's root directory
    """
    root, width = self.pairtree_root, self.shorty_length
    return ppath.id_to_dirpath(id, root, width)

203

def _id_to_dir_list(self, id):
    """
    Internal - turn an identifier into a list of pairtree directory
    shorties. Delegates to C{ppath.id_to_dir_list}, passing this store's
    C{pairtree_root} and C{shorty_length}.

    With the character mapping described in L{id_encode} and a shorty
    length of 2, the shorties look like:

        - I{"foobar://ark.1" --> ["fo","ob","ar","+=","=a","rk",",1"]}

    (presumably preceded by the pairtree root as the first element -
    L{delete_object} relies on joining the list to reach the object
    directory; confirm against C{ppath}).

    @param id: Identifier for a pairtree object
    @type id: identifier
    @returns: A list of directory path fragments to the object's root directory
    """
    root, width = self.pairtree_root, self.shorty_length
    return ppath.id_to_dir_list(id, root, width)

222

def _init_store(self):
    """
    Initialise the store if the directory doesn't exist: create the basic
    structure needed and write the prefix to disc.

    If the store directory does not exist:
      - with a C{uri_base} set, the directory, the C{pairtree_version0_1}
        marker file, the C{pairtree_prefix} file (containing C{uri_base})
        and the C{pairtree_root} directory are created;
      - without one, a L{NotAPairtreeStoreException} is raised.

    If the store directory exists, one of two things can happen:
      1. If it contains a C{pairtree_version0_1} file it is treated as a
         pairtree store. When a C{pairtree_prefix} file is also present, its
         stripped contents replace the supplied uri_base; otherwise the
         supplied uri_base is kept.
      2. If the marker file is missing, a L{NotAPairtreeStoreException}
         is raised.

    @raises NotAPairtreeStoreException: see above; also raised if
        C{store_dir} is not a directory once the checks are done.
    """
    version_file = os.path.join(self.store_dir, "pairtree_version0_1")
    prefix_file = os.path.join(self.store_dir, "pairtree_prefix")

    if not os.path.exists(self.store_dir):
        if not self.uri_base:
            # Nothing on disc and nothing to write as the prefix.
            raise NotAPairtreeStoreException(
                "No uri_base set for a non-existent store - "
                "store cannot be instanciated")
        os.mkdir(self.store_dir)
        # 'with' guarantees the handles are closed even if a write fails.
        with open(version_file, "w") as f:
            f.write("This directory conforms to Pairtree Version 0.1. Updated spec: http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html")
        with open(prefix_file, "w") as f:
            f.write(self.uri_base)
        os.mkdir(self.pairtree_root)
    elif os.path.exists(version_file):
        # Seems to be a pairtree0_1 compliant 'store'
        if os.path.exists(prefix_file):
            # Read the uri base of this store; it overrides the supplied one.
            with open(prefix_file, "r") as f:
                self.uri_base = f.read().strip()
    else:
        raise NotAPairtreeStoreException

    if not os.path.isdir(self.store_dir):
        raise NotAPairtreeStoreException

262

def list_ids(self):
    """
    Walk the store and yield the identifier of every pairtree object found.

    A directory counts as an object when it holds at least one entry whose
    name is longer than C{shorty_length}; shorter-named sub-directories are
    treated as shorties and descended into. Objects at 'split-ends' are
    therefore found, provided non-shortie directories have longer labels
    than the shorties - e.g::

        ab -- cd -- ef -- foo.txt
                |     |
                |     ---- gh
                |           |
                |           ---- foo.txt
                |
                ---- e -- foo.txt

    yields 'abcdef', 'abcde' and 'abcdefgh' (in no guaranteed order).

    TODO: Need to make sure this corresponds to pairtree spec.

    Currently, it ignores the possibility of a split end being
    'shielded' by a /obj/ folder

    Returns a generator, not a plain list since version 0.4.12

    @returns: L{generator}
    """
    root = self.pairtree_root
    # Stack of directories still to be inspected, seeded with the
    # sub-directories directly under the pairtree root.
    pending = [os.path.join(root, name)
               for name in os.listdir(root)
               if os.path.isdir(os.path.join(root, name))]
    already_yielded = set()

    while pending:
        current = pending.pop()
        for entry in os.listdir(current):
            if len(entry) > self.shorty_length:
                # A long name marks 'current' as an object directory.
                object_id = self._get_id_from_dirpath(current)
                if object_id not in already_yielded:
                    already_yielded.add(object_id)
                    yield object_id
            else:
                child = os.path.join(current, entry)
                if os.path.isdir(child):
                    pending.append(child)

308

def _create(self, id):
    """
    Internal - create the directory for a new object and return a handle
    to it.

    @param id: Identifier to be created
    @type id: identifier
    @returns: L{PairtreeStorageObject}
    @raises ObjectAlreadyExistsException: if the object's directory is
        already present on disc
    """
    target = self._id_to_dirpath(id)
    if os.path.exists(target):
        raise ObjectAlreadyExistsException
    os.makedirs(target)
    return PairtreeStorageObject(id, self)

324

def list_parts(self, id, path=None):
    """
    List the parts of the given identifier's object, leaving out every
    entry whose name is no longer than C{shorty_length} (i.e. shortie
    directories belonging to other objects).

    If path is supplied, the parts in that subdirectory are returned.

    >>> store.list_parts('foobar:1', 'data/images')
    [ 'image001.tif', 'image.... ]

    @param id: Identifier for pairtree object
    @type id: identifier
    @param path: (Optional) List the parts contained in C{path}'s subdirectory
    @type path: Directory path
    @returns: L{list}
    @raises ObjectNotFoundException: if the object directory (or the
        requested subpath) does not exist
    """
    target = self._id_to_dirpath(id)
    if path:
        target = os.path.join(target, path)
    if not os.path.exists(target):
        raise ObjectNotFoundException

    parts = []
    for name in os.listdir(target):
        if len(name) > self.shorty_length:
            parts.append(name)
    return parts

349

def isfile(self, id, filepath):
    """
    Report whether C{filepath}, taken relative to the object's directory,
    is a file.

    If the file doesn't exist, False is returned.

    @param id: Identifier for the pairtree object
    @type id: identifier
    @param filepath: Path to be tested
    @type filepath: Directory path
    @returns: L{bool}
    """
    candidate = os.path.join(self._id_to_dirpath(id), filepath)
    try:
        is_regular_file = os.path.isfile(candidate)
    except OSError:
        is_regular_file = False
    return is_regular_file

365

def isdir(self, id, filepath):
    """
    Report whether C{filepath}, taken relative to the object's directory,
    is a subdirectory.

    If the path doesn't exist, False is returned.

    @param id: Identifier for the pairtree object
    @type id: identifier
    @param filepath: Path to be tested
    @type filepath: Directory path
    @returns: L{bool}
    """
    candidate = os.path.join(self._id_to_dirpath(id), filepath)
    try:
        is_directory = os.path.isdir(candidate)
    except OSError:
        is_directory = False
    return is_directory

381

def put_stream(self, id, path, stream_name, bytestream, buffer_size = 1024 * 8):
    """
    Store a stream of bytes into a file within a pairtree object. Missing
    directories are created; an existing file of the same name is
    overwritten.

    Can be either a string of bytes, or a filelike object which supports
    bytestream.read(buffer_size) - useful for very large files.

    @param id: Identifier for the pairtree object to write to
    @type id: identifier
    @param path: (Optional) subdirectory path to store file in
    @type path: Directory path
    @param stream_name: Name of the file to write to
    @type stream_name: filename
    @param bytestream: Either a string or a file-like object to read from
    @type bytestream: string|file
    @param buffer_size: (Optional) Used for streaming filelike objects - defines the size of the buffer
                        to read in each cycle. A falsy value falls back to 8192.
    @type buffer_size: integer
    @returns: tuple C{(hashing_algorithm, hash)} or None if hashing is disabled
    @raises AttributeError: if C{hashing_type} is not an attribute of
        C{hashlib}; nothing is written to disc in that case
    """
    dirpath = self._id_to_dirpath(id)
    if path:
        dirpath = os.path.join(dirpath, path)

    # Build the hasher before touching the disc, so that an unknown
    # algorithm name cannot leave an open handle or an empty file behind.
    hash_gen = None
    if self.hashing_type is not None:
        hash_gen = getattr(hashlib, self.hashing_type)()

    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    with open(os.path.join(dirpath, stream_name), "wb") as f:
        if hasattr(bytestream, 'read'):
            # Stream file-like objects in with buffered reads
            if not buffer_size:
                buffer_size = 1024 * 8
            chunk = bytestream.read(buffer_size)
            while chunk:
                f.write(chunk)
                if hash_gen is not None:
                    hash_gen.update(chunk)
                chunk = bytestream.read(buffer_size)
        else:
            f.write(bytestream)
            if hash_gen is not None:
                hash_gen.update(bytestream)

    if hash_gen is not None:
        return (self.hashing_type, hash_gen.hexdigest())
    return None

430

431 - def get_appendable_stream(self, id, path, stream_name):

432 """ 433 Reads a filehandle for a pairtree object. This is a "wb+" opened file and 434 so can be appended to and obeys 'seek' 435 436 >>> with store.get_appendable_stream('foobar:1','data/images', 'image001.tif') as stream: 437 # Do something with the C{stream} handle 438 pass 439 440 stream is closed at the end of a C{with} block 441 442 @param id: Identifier for the pairtree object to read from 443 @type id: identifier 444 @param path: (Optional) subdirectory path to retrieve file from 445 @type path: Directory path 446 @param stream_name: Name of the file to read in 447 @type stream_name: filename 448 @returns: L{file} 449 """ 450 file_path = os.path.join(self._id_to_dirpath(id), stream_name) 451 if path: 452 file_path = os.path.join(self._id_to_dirpath(id), path, stream_name) 453 f = open(file_path, "wb+") 454 return f

455

def get_stream(self, id, path, stream_name, streamable=False):
    """
    Reads a file from a pairtree object - If streamable is set to True,
    this returns the filehandle for that file (opened "rb"), which must be
    C{close()}'d once finished with. In python 2.6 and above, this can be
    done easily:

    >>> with store.get_stream('foobar:1','data/images', 'image001.tif', True) as stream:
            # Do something with the C{stream} handle
            pass

    stream is closed at the end of a C{with} block

    @param id: Identifier for the pairtree object to read from
    @type id: identifier
    @param path: (Optional) subdirectory path to retrieve file from
    @type path: Directory path
    @param stream_name: Name of the file to read in
    @type stream_name: filename
    @param streamable: If True, returns a filelike handle to C{read()} from -
                       I{remember to C{close()} the file!} If False, reads in the file into a
                       bytestring and return that instead.
    @type streamable: True|False
    @returns: Either L{file} or L{str}
    @raises PartNotFoundException: if the file does not exist
    """
    file_path = os.path.join(self._id_to_dirpath(id), stream_name)
    if path:
        file_path = os.path.join(self._id_to_dirpath(id), path, stream_name)
    if not os.path.exists(file_path):
        raise PartNotFoundException(id=id, path=path, stream_name=stream_name,file_path=file_path)

    f = open(file_path, "rb")
    if streamable:
        # Ownership of the handle passes to the caller.
        return f
    try:
        return f.read()
    finally:
        # Close even when read() raises, so the handle is never leaked.
        f.close()

492

def del_stream(self, id, stream_name, path=None):
    """
    Delete a file from a pairtree object. Leaves no trace, be careful.
    If the target turns out to be a directory it is removed with
    C{os.rmdir}, which only succeeds on an empty directory.

    @param id: Identifier for the pairtree object to delete from
    @type id: identifier
    @param path: (Optional) subdirectory path to delete file from
    @type path: Directory path
    @param stream_name: Name of the file to delete
    @type stream_name: filename
    @raises PartNotFoundException: if the target does not exist
    """
    base = self._id_to_dirpath(id)
    if path:
        target = os.path.join(base, path, stream_name)
    else:
        target = os.path.join(base, stream_name)

    if not os.path.exists(target):
        raise PartNotFoundException(id=id, path=path, stream_name=stream_name,file_path=target)

    remover = os.rmdir if os.path.isdir(target) else os.remove
    remover(target)

512

def del_path(self, id, path, recursive=False):
    """
    Delete a subpath from an object, and can do so recursively (optional).

    Behaviour by case:
      - C{path} is a file: it is removed.
      - C{path} is a physically empty directory: it is removed.
      - C{recursive} is True: every part in the directory (entries whose
        name is longer than C{shorty_length}) is deleted, sub-directories
        included; the directory itself is removed only if it held nothing
        but parts, so shortie directories of other objects survive.
      - otherwise, a directory that still holds parts raises
        L{PathIsNotEmptyException}; one that holds only shorties is left
        untouched.

    @param id: Identifier for the pairtree object to delete from
    @type id: identifier
    @param path: subdirectory path to delete
    @type path: Directory path
    @param recursive: Whether the delete is recursive (think rm -r)
    @type recursive: bool
    @raises PartNotFoundException: if the path does not exist
    @raises PathIsNotEmptyException: if the directory has parts in it and
        C{recursive} is not True
    """
    dirpath = os.path.join(self._id_to_dirpath(id), path)
    if not os.path.exists(dirpath):
        raise PartNotFoundException
    if os.path.isfile(dirpath):
        os.remove(dirpath)
        return

    all_parts = os.listdir(dirpath)
    deletable_parts = [x for x in all_parts if len(x) > self.shorty_length]
    if not all_parts:
        os.rmdir(dirpath)
    elif recursive:
        for item in deletable_parts:
            # Test the entry at its real location; a bare name would be
            # resolved against the current working directory.
            item_path = os.path.join(dirpath, item)
            if os.path.isdir(item_path):
                shutil.rmtree(item_path)
            else:
                os.remove(item_path)
        if len(all_parts) == len(deletable_parts):
            os.rmdir(dirpath)
    elif deletable_parts:
        raise PathIsNotEmptyException
    # else: directory not physically empty, but empty of parts - leave it.

548

def delete_object(self, id):
    """
    Deletes an object from the pairtree store, including any parts and
    subpaths. There is no undo...

    Every part reported by L{list_parts} is removed with a recursive
    L{del_path}. If the object directory is empty afterwards it is removed
    too, and so is each parent directory left empty by that, stopping at
    the pairtree root.

    @param id: Identifier of the object to delete
    @type id: identifier
    @raises ObjectNotFoundException: if the object's directory does not exist
    """
    dirs = self._id_to_dir_list(id)
    dirpath = os.sep.join(dirs)
    if not os.path.exists(dirpath):
        raise ObjectNotFoundException

    for part in self.list_parts(id):
        self.del_path(id, part, recursive=True)

    if os.listdir(dirpath):
        # Shorties of other objects still live here; keep the directory.
        return

    os.rmdir(dirpath)
    # Walk upwards, pruning parents that have just become empty.
    dirs.pop()
    parent = os.sep.join(dirs)
    while (not os.listdir(parent) and parent != self.pairtree_root):
        os.rmdir(parent)
        dirs.pop()
        parent = os.sep.join(dirs)

569

def exists(self, id, path=None):
    """
    Answers the question "Does object or object subpath/file 'xxxxxxx' exist?"

    @param id: Identifier for the pairtree object to look for
    @type id: identifier
    @param path: (Optional) Subpath or subfilepath to check; when omitted
                 or empty, the object directory itself is checked
    @type path: Directory path
    @returns: L{bool}
    """
    location = self._id_to_dirpath(id)
    if path:
        location = os.path.join(location, path)
    return os.path.exists(location)

584

def _get_new_id(self):
    """
    Inbuilt method to randomly generate an id, if one is not given to
    L{get_object}.

    Draws random 14 digit long (base 10, zero-padded) numbers until one is
    found that L{exists} reports as unused - not fantastically useful, but
    at least unique in the store at the time of the check.

    @returns: Random but unique 14-digit long id number, as a string
    """
    while True:
        candidate = "%.14d" % random.randint(0,99999999999999)
        if not self.exists(candidate):
            return candidate

599

def get_object(self, id=None, create_if_doesnt_exist=True):
    """
    Returns a pairtree object with identifier C{id} if it exists.

    When no C{id} is given, a fresh one is generated with L{_get_new_id}
    and the object is created.

    If the object at C{id} doesn't exist, the outcome depends on
    C{create_if_doesnt_exist}:

    >>> bar = client.get_object('bar')
    # the object with id 'bar' will be retrieved and created if necessary.

    Setting this flag to False will cause it to raise an
    L{ObjectNotFoundException} if it cannot find the object:

    >>> fake = client.get_object('doesnotexist', create_if_doesnt_exist=False)
    # raises pairtree.storage_exceptions.ObjectNotFoundException

    (note that fake = client.get_object('doesnotexist', False) is equivalent to the above line)

    @param id: Identifier for the pairtree object to get (or create)
    @type id: identifier
    @param create_if_doesnt_exist: Flag - if True, an object will be created if it
        doesn't yet exist. Will raise an L{ObjectNotFoundException} if set to False
        and the object is non-existent.
    @type create_if_doesnt_exist: True|False
    @returns: L{PairtreeStorageObject}
    """
    if not id:
        # No identifier supplied: mint one and create the object.
        return self._create(self._get_new_id())
    if self.exists(id):
        return PairtreeStorageObject(id, self)
    if not create_if_doesnt_exist:
        raise ObjectNotFoundException
    return self._create(id)

636

def create_object(self, id):
    """
    Creates a new object with identifier C{id}, by way of L{_create}.

    >>> bar = client.create_object('bar')
    >>>

    Note that reissuing that command again will raise an
    L{ObjectAlreadyExistsException}:

    >>> bar = client.create_object('bar')
    # raises pairtree.storage_exceptions.ObjectAlreadyExistsException

    @param id: Identifier for the pairtree object to create
    @type id: identifier
    @returns: L{PairtreeStorageObject}
    """
    new_object = self._create(id)
    return new_object

Source Code for Module pairtree.pairtree_client