1
2
3
4 """
5 Conventions used:
6
7 From http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html version 0.1
8
9 This client handles all of the pairtree conventions, and provides a Pairtree object
10 to make it easier to interact with.
11
12 Usage
13 =====
14
15 >>> from pairtree import PairtreeStorageClient
16
17 To create a pairtree store in I{mystore/} to hold objects which have a URI base of
18 I{http://example.org/ark:/123}
19
20 >>> store = PairtreeStorageClient(store_dir='mystore', uri_base='http://example.org/ark:/123')
21
22 """
23
24 import os, sys, shutil
25
26 import codecs
27
28 import string
29
30 import re
31
32 from storage_exceptions import *
33
34 from pairtree_object import PairtreeStorageObject
35
36 import pairtree_path as ppath
37
38 import hashlib
39
41 """A client that oversees the implementation of the Pairtree FS specification
42 version 0.1.
43
44 >>> from pairtree import PairtreeStorageClient
45 >>> store = PairtreeStorageClient(store_dir='data', uri_base="http://")
46
47 This will create the following on disc in a directory called 'data' if it doesn't already exist::
48
49 $ ls -R data/
50 data/:
51 pairtree_prefix pairtree_root pairtree_version0_1
52
53 data/pairtree_root:
54
55 Where
56 1. the file 'pairtree_prefix' contains just "http://"
57 2. the file 'pairtree_version0_1' contains::
58
59 This directory conforms to Pairtree Version 0.1.
60 Updated spec: http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html
61
62 Note, if data *had* already existed and was a pairtree store, the uri_base would
63 have been read from the prefix file and would override the one supplied above.
64
65 Also, if you try to create a store over a directory that already exists, but which isn't
66 a pairtree store that it can recognise, it will raise a L{NotAPairtreeStoreException}.
67 """
def __init__(self, uri_base, store_dir, shorty_length=2, hashing_type=None):
    """
    Set up the client's configuration and then initialise the store on disc.

    @param uri_base: The URI base for the store. Any falsy value is stored as C{None}.
    @type uri_base: A URI fragment, like "http://example.org/"
    @param store_dir: The file directory where the pairtree store is
    @type store_dir: A path to a directory, relative or absolute
    @param shorty_length: The size of the shorties in the pairtree implementation (Default: 2)
    @type shorty_length: integer
    @param hashing_type: The name of the algorithm to use when hashing files;
                         leaving it as C{None} disables hashing.
    @type hashing_type: Any supported by C{hashlib}
    """
    # Plain configuration first.
    self.store_dir = store_dir
    self.pairtree_root = os.path.join(store_dir, 'pairtree_root')
    self.uri_base = uri_base if uri_base else None
    self.shorty_length = shorty_length
    self.hashing_type = hashing_type

    # Pre-compiled patterns kept on the instance.
    self._encode = re.compile(r"[\"*+,<=>?\\^|]|[^\x21-\x7e]", re.U)
    self._decode = re.compile(r"\^(..)", re.U)

    # Must run last: _init_store() relies on the attributes assigned above.
    self._init_store()
92
95
98
100 """
101 The identifier string is cleaned of characters that are expected to occur rarely
102 in object identifiers but that would cause certain known problems for file systems.
103 In this step, every UTF-8 octet outside the range of visible ASCII (94 characters
104 with hexadecimal codes 21-7e) [ASCII] (Cerf, “ASCII format for network interchange,”
105 October 1969.), as well as the following visible ASCII characters::
106
107 " hex 22 < hex 3c ? hex 3f
108 * hex 2a = hex 3d ^ hex 5e
109 + hex 2b > hex 3e | hex 7c
110 , hex 2c
111
112 must be converted to their corresponding 3-character hexadecimal encoding, ^hh,
113 where ^ is a circumflex and hh is two hex digits. For example, ' ' (space) is
114 converted to ^20 and '*' to ^2a.
115
116 In the second step, the following single-character to single-character conversions
117 must be done::
118
119 / -> =
120 : -> +
121 . -> ,
122
123 These are characters that occur quite commonly in opaque identifiers but present
124 special problems for filesystems. This step avoids requiring them to be hex encoded
125 (hence expanded to three characters), which keeps the typical ppath reasonably
126 short. Here are examples of identifier strings after cleaning and after
127 ppath mapping::
128
129 id: ark:/13030/xt12t3
130 -> ark+=13030=xt12t3
131 -> ar/k+/=1/30/30/=x/t1/2t/3/
132 id: http://n2t.info/urn:nbn:se:kb:repos-1
133 -> http+==n2t,info=urn+nbn+se+kb+repos-1
134 -> ht/tp/+=/=n/2t/,i/nf/o=/ur/n+/n/bn/+s/e+/kb/+/re/p/os/-1/
135 id: what-the-*@?#!^!?
136 -> what-the-^2a@^3f#!^5e!^3f
137 -> wh/at/-t/he/-^/2a/@^/3f/#!/^5/e!/^3/f/
138
139 (From section 3 of the Pairtree specification)
140
141 @param id: Encode the given identifier according to the pairtree 0.1 specification
142 @type id: identifier
143 @returns: A string of the encoded identifier
144 """
145 return ppath.id_encode(id)
146
148 """
149 This decodes a given identifier from its pairtree filesystem encoding, into
150 its original form:
151 @param id: Identifier to decode
152 @type id: identifier
153 @returns: A string of the decoded identifier
154 """
155 return ppath.id_decode(id)
156
158 """
159 Internal - method for discovering the pairtree identifier for a
160 given directory path.
161
162 E.g. pairtree_root/fo/ob/ar/+/ --> 'foobar:'
163
164 @param dirpath: Directory path to decode
165 @type dirpath: Path to object's root
166 @returns: Decoded identifier
167 """
168
169
170 return ppath.get_id_from_dirpath(dirpath, self.pairtree_root)
171
173 """
174 Internal - walks a directory chain and builds a list of the directory shorties
175 relative to the pairtree_root
176
177 @param dirpath: Directory path to walk
178 @type dirpath: Directory path
179 """
180
181
182
183
184
185
186
187 return ppath.get_path_from_dirpath(dirpath, self.pairtree_root)
188
189
191 """
192 Internal - method for turning an identifier into a pairtree directory tree
193 of shorties.
194
195 - I{"foobar://ark.1" --> "fo/ob/ar/+=/ar/k,/1"}
196
197 @param id: Identifier for a pairtree object
198 @type id: identifier
199 @returns: A directory path to the object's root directory
200 """
201
202 return ppath.id_to_dirpath(id, self.pairtree_root, self.shorty_length)
203
205 """
206 Internal - method for turning an identifier into a list of pairtree
207 directory tree of shorties.
208
209 - I{"foobar://ark.1" --> ["fo","ob","ar","+=","ar","k,","1"]}
210
212 @param id: Identifier for a pairtree object
212 @type id: identifier
213 @returns: A list of directory path fragments to the object's root directory
214 """
215
216
217
218
219
220
221 return ppath.id_to_dir_list(id, self.pairtree_root, self.shorty_length)
222
224 """
225 Initialise the store if the directory doesn't exist. Create the basic structure
226 needed and write the prefix to disc.
227
228 If the store directory exists, one of two things can happen:
229 1. If that directory can be understood by this library as a pairtree store,
230 it will attempt to read in the correct pairtree_prefix to use, instead of
231 the supplied uri_base.
232 2. If the directory cannot be understood, a L{NotAPairtreeStoreException} will
233 be raised.
234 """
235 if not os.path.exists(self.store_dir):
236 if self.uri_base:
237 os.mkdir(self.store_dir)
238 f = open(os.path.join(self.store_dir, "pairtree_version0_1"), "w")
239 f.write("This directory conforms to Pairtree Version 0.1. Updated spec: http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html")
240 f.close()
241 f = open(os.path.join(self.store_dir, "pairtree_prefix"),"w")
242 f.write(self.uri_base)
243 f.close()
244 os.mkdir(self.pairtree_root)
245 else:
246 raise NotAPairtreeStoreException("""No uri_base set for a non-existent
247 store - store cannot be instanciated""")
248 else:
249 if os.path.exists(os.path.join(self.store_dir, "pairtree_version0_1")):
250 """Seems to be a pairtree0_1 compliant 'store'"""
251 if os.path.exists(os.path.join(self.store_dir, "pairtree_prefix")):
252 """Read the uri base of this store"""
253 f = open(os.path.join(self.store_dir, "pairtree_prefix"),"r")
254 prefix = f.read().strip()
255 f.close()
256 self.uri_base = prefix
257 else:
258 raise NotAPairtreeStoreException
259
260 if not os.path.isdir(self.store_dir):
261 raise NotAPairtreeStoreException
262
264 """
265 Walk the store, and build a list of pairtree conformational objects in the
266 store. This will return objects in 'split-ends' and will function correctly
267 as long as non-shortie directories are just that; non-shortie directories must
268 have longer labels than the shorties - e.g::
269
270 ab -- cd -- ef -- foo.txt
271 | |
272 | ---- gh
273 | |
274 | ---- foo.txt
275 |
276 ---- e -- foo.txt
277
278 This method will return ['abcdef', 'abcde', 'abcdefgh'] as ids in this
279 store.
280
281 TODO: Need to make sure this corresponds to pairtree spec.
282
283 Currently, it ignores the possibility of a split end being
284 'shielded' by a /obj/ folder
285
286 Returns a generator, not a plain list since version 0.4.12
287
288 @returns: L{generator}
289 """
290
291 objects = set()
292 paths = [os.path.join(self.pairtree_root, x) for x in os.listdir(self.pairtree_root) if os.path.isdir(os.path.join(self.pairtree_root, x))]
293 d = None
294 if paths:
295 d = paths.pop()
296 while d:
297 for t in os.listdir(d):
298 if len(t)>self.shorty_length:
299 if self._get_id_from_dirpath(d) not in objects:
300 objects.add(self._get_id_from_dirpath(d))
301 yield self._get_id_from_dirpath(d)
302 elif os.path.isdir(os.path.join(d, t)):
303 paths.append(os.path.join(d, t))
304 if paths:
305 d = paths.pop()
306 else:
307 d =False
308
310 """
311 Internal - create an object. If the object already exists, raise a
312 L{ObjectAlreadyExistsException}
313
314 @param id: Identifier to be created
315 @type id: identifier
316 @returns: L{PairtreeStorageObject}
317 """
318 dirpath = os.path.join(self._id_to_dirpath(id))
319 if not os.path.exists(dirpath):
320 os.makedirs(dirpath)
321 else:
322 raise ObjectAlreadyExistsException
323 return PairtreeStorageObject(id, self)
324
326 """
327 List all the parts of the given identifier (excluding shortie directories
328 belonging to other objects)
329
330 If path is supplied, the parts in that subdirectory are returned.
331
332 If the subpath doesn't exist, a L{ObjectNotFoundException} will be raised.
333
334 >>> store.list_parts('foobar:1', 'data/images')
335 [ 'image001.tif', 'image.... ]
336
337 @param id: Identifier for pairtree object
338 @type id: identifier
339 @param path: (Optional) List the parts contained in C{path}'s subdirectory
340 @type path: Directory path
341 @returns: L{list}
342 """
343 dirpath = os.path.join(self._id_to_dirpath(id))
344 if path:
345 dirpath = os.path.join(self._id_to_dirpath(id), path)
346 if not os.path.exists(dirpath):
347 raise ObjectNotFoundException
348 return [x for x in os.listdir(dirpath) if len(x)>self.shorty_length]
349
def isfile(self, id, filepath):
    """
    Report whether C{filepath}, taken relative to the object's root
    directory, is a regular file.

    A path that does not exist yields False.

    @param id: Identifier for the pairtree object
    @type id: identifier
    @param filepath: Path to be tested
    @type filepath: Directory path
    @returns: L{bool}
    """
    candidate = os.path.join(self._id_to_dirpath(id), filepath)
    try:
        is_regular_file = os.path.isfile(candidate)
    except OSError:
        # Treat any filesystem error as "not a file".
        is_regular_file = False
    return is_regular_file
365
def isdir(self, id, filepath):
    """
    Report whether C{filepath}, taken relative to the object's root
    directory, is a subdirectory.

    A path that does not exist yields False.

    @param id: Identifier for the pairtree object
    @type id: identifier
    @param filepath: Path to be tested
    @type filepath: Directory path
    @returns: L{bool}
    """
    candidate = os.path.join(self._id_to_dirpath(id), filepath)
    try:
        is_directory = os.path.isdir(candidate)
    except OSError:
        # Treat any filesystem error as "not a directory".
        is_directory = False
    return is_directory
381
def put_stream(self, id, path, stream_name, bytestream, buffer_size=1024 * 8):
    """
    Store a stream of bytes into a file within a pairtree object.

    C{bytestream} can be either a string of bytes, or a filelike object which
    supports C{bytestream.read(buffer_size)} - useful for very large files.
    Missing directories (the object's root and C{path}) are created.

    @param id: Identifier for the pairtree object to write to
    @type id: identifier
    @param path: (Optional) subdirectory path to store file in
    @type path: Directory path
    @param stream_name: Name of the file to write to
    @type stream_name: filename
    @param bytestream: Either a string or a file-like object to read from
    @type bytestream: string|file
    @param buffer_size: (Optional) Used for streaming filelike objects - defines the size
                        of the buffer to read in each cycle. A falsy value falls back to 8 KiB.
    @type buffer_size: integer
    @returns: tuple C{(hashing_algorithm, hash)} or None if hashing is disabled
    @raise AttributeError: if C{self.hashing_type} does not name a C{hashlib} constructor;
                           raised before anything is written to disc.
    """
    # Build the hasher *before* touching the filesystem: an unknown
    # hashing_type must not truncate an existing part or leak a handle.
    hash_gen = None
    if self.hashing_type is not None:
        hash_gen = getattr(hashlib, self.hashing_type)()

    dirpath = self._id_to_dirpath(id)
    if path:
        dirpath = os.path.join(dirpath, path)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    # The with-block guarantees the handle is closed even if a read/write fails.
    with open(os.path.join(dirpath, stream_name), "wb") as f:
        if hasattr(bytestream, 'read'):
            # File-like source: copy it across in buffer_size chunks.
            if not buffer_size:
                buffer_size = 1024 * 8
            chunk = bytestream.read(buffer_size)
            while chunk:
                f.write(chunk)
                if hash_gen is not None:
                    hash_gen.update(chunk)
                chunk = bytestream.read(buffer_size)
        else:
            # In-memory source: write it in one go.
            f.write(bytestream)
            if hash_gen is not None:
                hash_gen.update(bytestream)

    if hash_gen is not None:
        return (self.hashing_type, hash_gen.hexdigest())
    return None
430
432 """
433 Returns a filehandle for a part of a pairtree object. The file is opened in "wb+"
434 mode: it is created (or truncated) on open, can be written to and read back, and obeys 'seek'
435
436 >>> with store.get_appendable_stream('foobar:1','data/images', 'image001.tif') as stream:
437 # Do something with the C{stream} handle
438 pass
439
440 stream is closed at the end of a C{with} block
441
442 @param id: Identifier for the pairtree object to read from
443 @type id: identifier
444 @param path: (Optional) subdirectory path to retrieve file from
445 @type path: Directory path
446 @param stream_name: Name of the file to read in
447 @type stream_name: filename
448 @returns: L{file}
449 """
450 file_path = os.path.join(self._id_to_dirpath(id), stream_name)
451 if path:
452 file_path = os.path.join(self._id_to_dirpath(id), path, stream_name)
453 f = open(file_path, "wb+")
454 return f
455
def get_stream(self, id, path, stream_name, streamable=False):
    """
    Reads a file from a pairtree object - If streamable is set to True,
    this returns the filehandle for that file, which must be C{close()}'d
    once finished with. In python 2.6 and above, this can be done easily:

    >>> with store.get_stream('foobar:1','data/images', 'image001.tif', True) as stream:
            # Do something with the C{stream} handle
            pass

    stream is closed at the end of a C{with} block

    @param id: Identifier for the pairtree object to read from
    @type id: identifier
    @param path: (Optional) subdirectory path to retrieve file from
    @type path: Directory path
    @param stream_name: Name of the file to read in
    @type stream_name: filename
    @param streamable: If True, returns a filelike handle (opened "rb") to C{read()} from -
        I{remember to C{close()} the file!} If False, reads the whole file into a
        bytestring and returns that instead.
    @type streamable: True|False
    @returns: Either L{file} or L{str}
    @raise PartNotFoundException: if the requested file does not exist
    """
    object_dir = self._id_to_dirpath(id)
    if path:
        file_path = os.path.join(object_dir, path, stream_name)
    else:
        file_path = os.path.join(object_dir, stream_name)
    if not os.path.exists(file_path):
        raise PartNotFoundException(id=id, path=path, stream_name=stream_name,file_path=file_path)

    if streamable:
        # Ownership of the handle passes to the caller.
        return open(file_path, "rb")

    # Non-streaming read: the with-block closes the handle even if read() fails.
    with open(file_path, "rb") as handle:
        return handle.read()
492
def del_stream(self, id, stream_name, path=None):
    """
    Remove a single part from a pairtree object. Leaves no trace, be careful.

    If the named part is a directory it is removed with C{os.rmdir};
    otherwise it is removed with C{os.remove}.

    @param id: Identifier for the pairtree object to delete from
    @type id: identifier
    @param stream_name: Name of the file to delete
    @type stream_name: filename
    @param path: (Optional) subdirectory path to delete file from
    @type path: Directory path
    @raise PartNotFoundException: if the part does not exist
    """
    # Assemble <object root>[/<path>]/<stream_name>.
    segments = [self._id_to_dirpath(id)]
    if path:
        segments.append(path)
    segments.append(stream_name)
    target = os.path.join(*segments)

    if not os.path.exists(target):
        raise PartNotFoundException(id=id, path=path, stream_name=stream_name,file_path=target)

    remover = os.rmdir if os.path.isdir(target) else os.remove
    remover(target)
512
def del_path(self, id, path, recursive=False):
    """
    Delete a subpath from an object, and can do so recursively (optional).

    Entries whose names are no longer than C{self.shorty_length} are treated
    as shorties and are never deleted. Behaviour by case:

      - C{path} is a file: the file is removed.
      - C{path} is an empty directory: the directory is removed.
      - C{recursive} is True: every non-shorty entry is removed, and the
        directory itself is removed only if it held no shorties.
      - the directory holds only shorties: nothing is done.
      - otherwise a L{PathIsNotEmptyException} is raised.

    @param id: Identifier for the pairtree object to delete from
    @type id: identifier
    @param path: subdirectory path to delete
    @type path: Directory path
    @param recursive: Whether the delete is recursive (think rm -r)
    @type recursive: bool
    @raise PartNotFoundException: if the subpath does not exist
    @raise PathIsNotEmptyException: if the subpath has parts in it and
                                    C{recursive} is not True
    """
    dirpath = os.path.join(self._id_to_dirpath(id), path)
    if not os.path.exists(dirpath):
        raise PartNotFoundException
    if os.path.isfile(dirpath):
        os.remove(dirpath)
        return

    all_parts = os.listdir(dirpath)
    deletable_parts = [x for x in all_parts if len(x) > self.shorty_length]
    if not all_parts:
        os.rmdir(dirpath)
    elif recursive:
        for item in deletable_parts:
            # Test the entry inside dirpath; a bare name would be resolved
            # against the current working directory instead.
            item_path = os.path.join(dirpath, item)
            if os.path.isdir(item_path) and not os.path.islink(item_path):
                shutil.rmtree(item_path)
            else:
                # Regular files, and symlinks (rmtree refuses to act on those).
                os.remove(item_path)
        # Only remove the directory itself when no shorties were left behind.
        if len(all_parts) == len(deletable_parts):
            os.rmdir(dirpath)
    elif deletable_parts:
        raise PathIsNotEmptyException
    # Otherwise the directory holds only shorties: leave it untouched.
548
550 """
551 Deletes an object from the pairtree store, including any parts and subpaths
552 There is no undo...
553 @param id: Identifier of the object to delete
554 @type id: identifier
555 """
556 dirs = self._id_to_dir_list(id)
557 dirpath = os.path.join(os.sep.join(dirs))
558 if not os.path.exists(dirpath):
559 raise ObjectNotFoundException
560 for item in self.list_parts(id):
561 self.del_path(id,item, recursive=True)
562 if not os.listdir(dirpath):
563 os.rmdir(dirpath)
564
565 leaf = dirs.pop()
566 while (not os.listdir(os.sep.join(dirs)) and os.sep.join(dirs) != self.pairtree_root):
567 os.rmdir(os.sep.join(dirs))
568 dirs.pop()
569
def exists(self, id, path=None):
    """
    Answers the question "Does object or object subpath/file 'xxxxxxx' exist?"

    With no C{path}, the object's own root directory is tested; otherwise
    C{path} is resolved relative to that directory.

    @param id: Identifier for the pairtree object to look for
    @type id: identifier
    @param path: (Optional) Subpath or subfilepath to check
    @type path: Directory path
    @returns: L{bool}
    """
    target = self._id_to_dirpath(id)
    if path:
        target = os.path.join(target, path)
    return os.path.exists(target)
584
586 """
587 Inbuilt method to randomly generate an id, if one is not given to either
588 L{get_object} or L{create_object}.
589
590 Simply returns a random 14 digit long (base 10) number, not fantastically useful
591 but at least makes sure it is unique in the store.
592
593 @returns: Random but unique 14-digit long id number
594 """
595 id = "%.14d" % random.randint(0,99999999999999)
596 while self.exists(id):
597 id = "%.14d" % random.randint(0,99999999999999)
598 return id
599
def get_object(self, id=None, create_if_doesnt_exist=True):
    """
    Fetch the pairtree object with identifier C{id}, creating it when allowed.

    Behaviour, in order:

      1. No C{id} given: a fresh identifier is obtained from
         C{self._get_new_id()} and a new object is created under it.
      2. The object exists: it is returned.
      3. The object is missing and C{create_if_doesnt_exist} is True: it is
         created and returned.
      4. Otherwise an L{ObjectNotFoundException} is raised.

    >>> bar = client.get_object('bar')
    # the object with id 'bar' will be retrieved and created if necessary.

    >>> fake = client.get_object('doesnotexist', create_if_doesnt_exist=False)
    Traceback (most recent call last):
      ...
    pairtree.storage_exceptions.ObjectNotFoundException

    (note that fake = client.get_object('doesnotexist', False) is equivalent to the above line)

    @param id: Identifier for the pairtree object to get (or create)
    @type id: identifier
    @param create_if_doesnt_exist: Flag - if True, an object will be created if it
        doesn't yet exist. Will raise an L{ObjectNotFoundException} if set to False
        and the object is non-existent.
    @type create_if_doesnt_exist: True|False
    @returns: L{PairtreeStorageObject}
    """
    # No identifier supplied: mint one and create the object straight away.
    if not id:
        return self._create(self._get_new_id())

    # Known object: hand back a wrapper for it.
    if self.exists(id):
        return PairtreeStorageObject(id, self)

    # Unknown object and the caller forbade creation.
    if not create_if_doesnt_exist:
        raise ObjectNotFoundException

    return self._create(id)
636
638 """
639 Creates a new object with identifier C{id}
640
641 >>> bar = client.create_object('bar')
642 >>>
643
644 Note that reissuing that command again will raise an L{ObjectAlreadyExistsException}:
645
646 >>> bar = client.create_object('bar')
647 Traceback (most recent call last):
648 File "<stdin>", line 1, in <module>
649 File "build/bdist.linux-i686/egg/pairtree/pairtree_client.py", line 235, in create_object
650 pairtree.storage_exceptions.ObjectAlreadyExistsException
651
652 @param id: Identifier for the pairtree object to create
653 @type id: identifier
654 @returns: L{PairtreeStorageObject}
655 """
656 return self._create(id)
657