Package pairtree :: Module pairtree_path
[hide private]
[frames] | no frames]

Source Code for Module pairtree.pairtree_path

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  Conventions used: 
  6   
  7  From http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html version 0.1 
  8   
  9  This client handles all of the pairtree conventions, and provides a Pairtree object 
 10  to make it easier to interact with. 
 11   
 12  Usage 
 13  ===== 
 14   
 15  >>> from pairtree import PairtreeStorageClient 
 16   
 17  To create a pairtree store in I{mystore/} to hold objects which have a URI base of 
 18  I{http://example.org/ark:/123} 
 19   
 20  >>> store = PairtreeStorageClient(store_dir='mystore', uri_base='http://example.org/ark:/123') 
 21   
 22  """ 
 23   
 24  import os, sys, shutil 
 25   
 26  import codecs 
 27   
 28  import string 
 29   
 30  import re 
 31   
 32  from storage_exceptions import * 
 33   
 34  import logging 
 35   
 36  logging.basicConfig(level=logging.INFO) 
 37   
 38  logger = logging.getLogger('pairtreepath') 
 39   
 40  encode_regex = re.compile(r"[\"*+,<=>?\\^|]|[^\x21-\x7e]", re.U) 
 41  decode_regex = re.compile(r"\^(..)", re.U) 
 42   
def char2hex(m):
    """
    Regex substitution callback that turns the matched character into its
    circumflex-prefixed hexadecimal escape.

    @param m: Match whose complete text (group 0) is a single character
    @type m: re match object
    @returns: '^' followed by the character's ordinal in lowercase hex,
        zero-padded to at least two digits (e.g. '*' becomes '^2a')
    """
    matched_char = m.group(0)
    return "^%02x" % ord(matched_char)
45
def hex2char(m):
    """
    Regex substitution callback that turns a captured pair of hexadecimal
    digits back into the character with that ordinal.

    @param m: Match whose first group holds the hexadecimal digits
    @type m: re match object
    @returns: The single character whose ordinal the digits spell out
    """
    hex_digits = m.group(1)
    ordinal = int(hex_digits, 16)
    return chr(ordinal)
48 49
def id_encode(id):
    """
    Encode an identifier into its pairtree-safe form (section 3 of the
    Pairtree 0.1 specification).

    Step one cleans the identifier of characters that are rare in object
    identifiers but troublesome for file systems. Every UTF-8 octet outside
    the range of visible ASCII (hexadecimal codes 21-7e), as well as the
    following visible ASCII characters::

        "   hex 22           <   hex 3c           ?   hex 3f
        *   hex 2a           =   hex 3d           ^   hex 5e
        +   hex 2b           >   hex 3e           |   hex 7c
        ,   hex 2c

    is replaced by its 3-character hexadecimal encoding, ^hh, where ^ is a
    circumflex and hh is two hex digits. For example, ' ' (space) becomes
    ^20 and '*' becomes ^2a. The module's encode_regex additionally escapes
    the backslash (hex 5c).

    Step two applies these single-character to single-character swaps::

        / -> =
        : -> +
        . -> ,

    These characters are common in opaque identifiers but awkward on
    filesystems; swapping them instead of hex encoding them keeps the
    typical ppath reasonably short. Examples of identifier strings after
    cleaning and after ppath mapping::

        id:  ark:/13030/xt12t3
            ->  ark+=13030=xt12t3
            ->  ar/k+/=1/30/30/=x/t1/2t/3/
        id:  http://n2t.info/urn:nbn:se:kb:repos-1
            ->  http+==n2t,info=urn+nbn+se+kb+repos-1
            ->  ht/tp/+=/=n/2t/,i/nf/o=/ur/n+/n/bn/+s/e+/kb/+/re/p/os/-1/
        id:  what-the-*@?#!^!?
            ->  what-the-^2a@^3f#!^5e!^3f
            ->  wh/at/-t/he/-^/2a/@^/3f/#!/^5/e!/^3/f/

    @param id: Identifier to encode according to the pairtree 0.1 specification
    @type id: identifier
    @returns: A string of the encoded identifier
    """
    # Work on UTF-8 octets so that non-ASCII characters are escaped byte by byte.
    if isinstance(id, unicode):
        # TODO - not assume encoding
        id = id.encode('utf-8')

    # Step one: hex-escape the problem characters (Erik Hetzner's regex).
    escaped = encode_regex.sub(char2hex, id)

    # Step two: swap the common-but-awkward characters one for one.
    swaps = {'/': '=', ':': '+', '.': ','}
    return "".join([swaps.get(ch, ch) for ch in escaped])
115
def id_decode(id):
    """
    Decode an identifier from its pairtree filesystem encoding back into
    its original form.

    @param id: Identifier to decode
    @type id: identifier
    @returns: A unicode string of the decoded identifier (the decoded
        octets are interpreted as UTF-8)
    """
    # Mirror id_encode and operate on UTF-8 octets. With a unicode input, the
    # substitution below would mix unicode text with the raw bytes returned by
    # hex2char, which fails with UnicodeDecodeError as soon as an escaped
    # octet is non-ASCII (e.g. u'^c3^a9').
    if isinstance(id, unicode):
        id = id.encode('utf-8')

    # Undo the single-character swaps first. Literal '=', '+' and ',' were
    # hex-escaped by the encoder, so they are still in ^hh form at this point
    # and cannot be confused with the swapped characters.
    swaps = {'=': '/', '+': ':', ',': '.'}
    unswapped = "".join([swaps.get(ch, ch) for ch in id])

    # Then expand every ^hh escape back into its octet
    # (Erik Hetzner's regex).
    octets = decode_regex.sub(hex2char, unswapped)

    # TODO - drop the assumption of utf-8
    return octets.decode('utf-8')
138 139
def get_id_from_dirpath(dirpath, pairtree_root=""):
    """
    Internal - recover the pairtree identifier that a directory path
    stands for.

    E.g.  pairtree_root/fo/ob/ar/+/ --> 'foobar:'

    @param dirpath: Directory path to decode
    @type dirpath: Path to object's root
    @param pairtree_root: Path prefix at which the walk up the tree stops
    @type pairtree_root: Directory path
    @returns: Decoded identifier
    """
    shorties = get_path_from_dirpath(dirpath, pairtree_root)
    encoded_id = "".join(shorties)
    return id_decode(encoded_id)
153
def get_path_from_dirpath(dirpath, pairtree_root=""):
    """
    Internal - walks a directory chain upwards and builds a list of the
    directory shorties relative to the pairtree_root.

    The walk stops when the remaining head compares equal to pairtree_root,
    so pairtree_root has to be spelled exactly as it appears at the start of
    dirpath (for instance, without a trailing separator).

    @param dirpath: Directory path to walk
    @type dirpath: Directory path
    @param pairtree_root: Path prefix at which the walk stops
    @type pairtree_root: Directory path
    @returns: List of path components below pairtree_root, outermost first
    @raises ValueError: If the top of dirpath is reached without meeting
        pairtree_root (previously this case looped forever)
    """
    head, tail = os.path.split(dirpath)
    # Components are collected leaf-first and reversed before returning.
    path = [tail]
    while head != pairtree_root:
        parent, tail = os.path.split(head)
        if parent == head:
            # os.path.split() can no longer shorten the path: the top has been
            # reached without ever matching pairtree_root, so carrying on
            # would append empty components forever.
            raise ValueError("%r is not located beneath pairtree root %r"
                             % (dirpath, pairtree_root))
        path.append(tail)
        head = parent
    path.reverse()
    return path
169
def id_to_dirpath(id, pairtree_root="", shorty_length=2):
    """
    Internal - turn an identifier into a pairtree directory path made of
    shorties joined with the platform's path separator.

      - I{"foobar:/ark.1" --> "fo/ob/ar/+=/ar/k,/1"}

    @param id: Identifier for a pairtree object
    @type id: identifier
    @param pairtree_root: Optional leading path component placed before the shorties
    @type pairtree_root: Directory path
    @param shorty_length: Number of characters per directory shorty
    @type shorty_length: int
    @returns: A directory path to the object's root directory
    """
    segments = id_to_dir_list(id, pairtree_root, shorty_length)
    return os.sep.join(segments)
182 183
def id_to_dir_list(id, pairtree_root="", shorty_length=2):
    """
    Internal - turn an identifier into the list of pairtree directory
    shorties that lead to its object directory.

      - I{"foobar:/ark.1" --> ["fo","ob","ar","+=","ar","k,","1"]}

    @param id: Identifier for a pairtree object
    @type id: identifier
    @param pairtree_root: Optional leading path component; when non-empty it
        becomes the first element of the returned list
    @type pairtree_root: Directory path
    @param shorty_length: Number of characters per directory shorty
    @type shorty_length: int
    @returns: A list of directory path fragments to the object's root directory
    @raises ValueError: If shorty_length is smaller than 1 while there is an
        encoded identifier to split (previously this case looped forever)
    """
    enc_id = id_encode(id)
    dirpath = [pairtree_root] if pairtree_root else []
    if not enc_id:
        # Nothing to split; only the optional root is returned.
        return dirpath
    if shorty_length < 1:
        # A shorty of zero or negative length never consumes the identifier,
        # so chopping it up could not terminate.
        raise ValueError("shorty_length must be at least 1, got %r"
                         % (shorty_length,))
    # Cut the encoded identifier into consecutive shorties; the last one may
    # be shorter than shorty_length.
    dirpath.extend([enc_id[start:start + shorty_length]
                    for start in range(0, len(enc_id), shorty_length)])
    return dirpath
203