1
2
3
4 """
5 Conventions used:
6
7 From http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html version 0.1
8
9 This client handles all of the pairtree conventions, and provides a Pairtree object
10 to make it easier to interact with.
11
12 Usage
13 =====
14
15 >>> from pairtree import PairtreeStorageClient
16
17 To create a pairtree store in I{mystore/} to hold objects which have a URI base of
18 I{http://example.org/ark:/123}
19
20 >>> store = PairtreeStorageClient(store_dir='mystore', uri_base='http://example.org/ark:/123')
21
22 """
23
24 import os, sys, shutil
25
26 import codecs
27
28 import string
29
30 import re
31
32 from storage_exceptions import *
33
34 import logging
35
# NOTE(review): configuring the root logger at import time affects every
# application that imports this module; kept as-is because callers may rely on it.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger('pairtreepath')

# Matches every single character that must be written as ^hh when encoding an
# identifier: the listed visible-ASCII punctuation (including backslash and the
# circumflex itself) and anything outside the visible ASCII range 0x21-0x7e.
encode_regex = re.compile(r"[\"*+,<=>?\\^|]|[^\x21-\x7e]", re.U)
# Matches a circumflex followed by any two characters; group 1 is the pair
# that is interpreted as hex digits when decoding.
decode_regex = re.compile(r"\^(..)", re.U)
42
def char2hex(m):
    """
    Substitution callback: render the matched character in pairtree ^hh form.

    @param m: Match whose full text (group 0) is exactly one character
    @type m: regular expression match object
    @returns: A circumflex followed by the character's ordinal in lowercase
        hex, zero-padded to at least two digits (e.g. '*' -> '^2a')
    """
    return "^%02x" % ord(m.group(0))
45
def hex2char(m):
    """
    Substitution callback: turn a matched ^hh escape back into one character.

    @param m: Match whose group 1 holds the two hex digits following the circumflex
    @type m: regular expression match object
    @returns: The character whose ordinal is the hex value of group 1
    @raises ValueError: If group 1 is not a valid base-16 number
    """
    return chr(int(m.group(1), 16))
48
49
def id_encode(id):
    """
    Encode an identifier into its pairtree filesystem-safe form.

    The identifier string is cleaned of characters that are expected to occur rarely
    in object identifiers but that would cause certain known problems for file systems.
    In this step, every UTF-8 octet outside the range of visible ASCII (94 characters
    with hexadecimal codes 21-7e) [ASCII] (Cerf, "ASCII format for network interchange,"
    October 1969.), as well as the following visible ASCII characters::

        "   hex 22           <   hex 3c           ?   hex 3f
        *   hex 2a           =   hex 3d           ^   hex 5e
        +   hex 2b           >   hex 3e           |   hex 7c
        ,   hex 2c

    must be converted to their corresponding 3-character hexadecimal encoding, ^hh,
    where ^ is a circumflex and hh is two hex digits. For example, ' ' (space) is
    converted to ^20 and '*' to ^2a. This implementation additionally escapes the
    backslash character (hex 5c) in the same way.

    In the second step, the following single-character to single-character conversions
    must be done::

        / -> =
        : -> +
        . -> ,

    These are characters that occur quite commonly in opaque identifiers but present
    special problems for filesystems. This step avoids requiring them to be hex encoded
    (hence expanded to three characters), which keeps the typical ppath reasonably
    short. Here are examples of identifier strings after cleaning and after
    ppath mapping::

        id:  ark:/13030/xt12t3
          -> ark+=13030=xt12t3
          -> ar/k+/=1/30/30/=x/t1/2t/3/
        id:  http://n2t.info/urn:nbn:se:kb:repos-1
          -> http+==n2t,info=urn+nbn+se+kb+repos-1
          -> ht/tp/+=/=n/2t/,i/nf/o=/ur/n+/n/bn/+s/e+/kb/+/re/p/os/-1/
        id:  what-the-*@?#!^!?
          -> what-the-^2a@^3f#!^5e!^3f
          -> wh/at/-t/he/-^/2a/@^/3f/#!/^5/e!/^3/f/

    (From section 3 of the Pairtree specification)

    @param id: Identifier to encode according to the pairtree 0.1 specification;
        text is encoded as UTF-8 first, a byte string is used as-is
    @type id: identifier
    @returns: A string of the encoded identifier
    """
    # Work on UTF-8 octets so multi-byte characters become one ^hh per octet.
    # `bytes` is the native byte string on both Python 2 (str) and Python 3.
    if not isinstance(id, bytes):
        id = id.encode('utf-8')

    hex_encoded_chars = '"*+,<=>?\\^|'
    second_pass_m = {'/': '=',
                     ':': '+',
                     '.': ','}

    encoded = []
    # bytearray iteration yields integers on every Python version.
    for octet in bytearray(id):
        char = chr(octet)
        if octet < 0x21 or octet > 0x7e or char in hex_encoded_chars:
            encoded.append("^%02x" % octet)
        else:
            # The substituted characters are never hex-encoded themselves, so
            # both passes can be applied per octet in a single sweep.
            encoded.append(second_pass_m.get(char, char))
    return "".join(encoded)
115
def id_decode(id):
    """
    This decodes a given identifier from its pairtree filesystem encoding, into
    its original form.

    The single-character substitutions are reversed ('=' -> '/', '+' -> ':',
    ',' -> '.'), every ^hh escape is turned back into the octet it names, and
    the resulting octets are decoded as UTF-8.

    @param id: Identifier to decode
    @type id: identifier
    @returns: A string of the decoded identifier
    @raises ValueError: If a circumflex is followed by two characters that are
        not hexadecimal digits
    """
    # Normalise to text so the input is walked one character at a time on both
    # Python 2 and Python 3.
    if isinstance(id, bytes):
        id = id.decode('utf-8')

    second_pass_m = {'=': '/',
                     '+': ':',
                     ',': '.'}
    hex_digits = '0123456789abcdefABCDEF'

    octets = bytearray()
    pos = 0
    end = len(id)
    while pos < end:
        char = id[pos]
        pair = id[pos + 1:pos + 3]
        if char == '^' and len(pair) == 2:
            if pair[0] not in hex_digits or pair[1] not in hex_digits:
                raise ValueError(
                    "Invalid ^hh escape %r in encoded identifier %r" % ('^' + pair, id))
            # Escaped octets are literal data: they must not be reverse-mapped.
            octets.append(int(pair, 16))
            pos += 3
        else:
            # A circumflex with fewer than two characters after it is passed
            # through unchanged rather than treated as an error.
            octets.extend(second_pass_m.get(char, char).encode('utf-8'))
            pos += 1
    return bytes(octets).decode('utf-8')
138
139
def get_id_from_dirpath(dirpath, pairtree_root=""):
    """
    Internal - method for discovering the pairtree identifier for a
    given directory path.

    E.g.  pairtree_root/fo/ob/ar/+/  --> 'foobar:'

    @param dirpath: Directory path to decode
    @type dirpath: Path to object's root
    @param pairtree_root: Directory that the shorty directories hang from;
        everything in dirpath below it is treated as the encoded identifier
    @type pairtree_root: Directory path
    @returns: Decoded identifier
    """
    # Concatenating the shorty directory names rebuilds the encoded identifier.
    shorties = get_path_from_dirpath(dirpath, pairtree_root)
    return id_decode("".join(shorties))
153
def get_path_from_dirpath(dirpath, pairtree_root=""):
    """
    Internal - walks a directory chain and builds a list of the directory shorties
    relative to the pairtree_root

    @param dirpath: Directory path to walk
    @type dirpath: Directory path
    @param pairtree_root: Ancestor directory at which the walk stops; a trailing
        path separator is ignored
    @type pairtree_root: Directory path
    @returns: List of path components between pairtree_root and the end of
        dirpath, outermost first (a trailing separator on dirpath yields a
        final empty component)
    @raises ValueError: If pairtree_root is not an ancestor of dirpath
    """
    # os.path.split never returns a head with a trailing separator (other than
    # the filesystem root itself), so strip it from the root before comparing.
    separators = os.sep + (os.altsep or "")
    root = pairtree_root.rstrip(separators) or pairtree_root

    head, tail = os.path.split(dirpath)
    shorties = [tail]
    while head != root:
        parent, tail = os.path.split(head)
        if parent == head:
            # Reached the top of the path without meeting the root; continuing
            # would loop forever.
            raise ValueError(
                "%r is not located beneath pairtree root %r" % (dirpath, pairtree_root))
        head = parent
        shorties.append(tail)
    shorties.reverse()
    return shorties
169
def id_to_dirpath(id, pairtree_root="", shorty_length=2):
    """
    Internal - method for turning an identifier into a pairtree directory tree
    of shorties.

        - I{"foobar:/ark.1" --> "fo/ob/ar/+=/ar/k,/1"}

    @param id: Identifier for a pairtree object
    @type id: identifier
    @param pairtree_root: Optional leading directory placed before the shorties
    @type pairtree_root: Directory path
    @param shorty_length: Number of characters in each shorty directory name
    @type shorty_length: integer
    @returns: A directory path to the object's root directory
    """
    # Components are joined with the platform separator, not normalised.
    return os.sep.join(id_to_dir_list(id, pairtree_root, shorty_length))
182
183
def id_to_dir_list(id, pairtree_root="", shorty_length=2):
    """
    Internal - method for turning an identifier into a list of pairtree
    directory tree of shorties.

        - I{"foobar:/ark.1" --> ["fo","ob","ar","+=","ar","k,","1"]}

    @param id: Identifier for a pairtree object
    @type id: identifier
    @param pairtree_root: Optional leading directory; when non-empty it becomes
        the first element of the returned list
    @type pairtree_root: Directory path
    @param shorty_length: Number of characters in each shorty directory name
    @type shorty_length: integer
    @returns: A list of directory path fragments to the object's root directory
    @raises ValueError: If shorty_length is smaller than 1
    """
    # A non-positive length would never consume the encoded identifier.
    if shorty_length < 1:
        raise ValueError(
            "shorty_length must be a positive integer, got %r" % (shorty_length,))

    enc_id = id_encode(id)
    dirpath = [pairtree_root] if pairtree_root else []
    # Slice the encoded identifier into consecutive shorties; the last one may
    # be shorter than shorty_length.
    dirpath.extend(enc_id[start:start + shorty_length]
                   for start in range(0, len(enc_id), shorty_length))
    return dirpath
203