1 """
2 utilities and wrappers for working with HDF archives.
3
4 * an archive contains one of more datasets
5
6 * a dataset contains
7
8 - an array X: the data
9 - an optional array Y: the raw response
10 - an optional array T: labels (targets) associated with the response
11
12 a dataset specification (dset_path) is the concatenation
13 (`:` is used as a separator)
14 of the path to the HDF file and the dataset name. All datasets hang on the root.
15 """

import datetime
import logging
import os
import re

import pandas as pd

import filelock
from . import deprecated

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

__HDF_SUFFIX__ = "h5"
__SPEC_SEP__ = ":"

DSPEC_MSG = ("dataset specification; of the form "
             "<hdf_archive_path>%s<dataset_name>" % __SPEC_SEP__)

# pattern reconstructed from its use in __parse(): the first group is greedy,
# so the dataset name is whatever follows the last separator
PATT = re.compile("(.+)%s(.+)" % __SPEC_SEP__)


def __parse(path):
    try:
        return PATT.match(path).groups()
    except AttributeError:
        raise AttributeError("cannot parse (pattern %s) path (%s)"
                             % (PATT.pattern, path))

43 """a dataset has the form <archname>:<basename>
44
45 @return: the archive (inside the file) part of the path
46 """
47
48 return __parse(path)[1]
49
51 """a dataset has the form <archname>:<basename>
52
53 @return: the archname part of the path
54 """
55 return __parse(path)[0]
56
58 """a dataset has the form <archname>:<basename>
59
60 @return: (archname, basename)
61 """
62 return __parse(path)
63
64
def join(path, dsname):
    "join archname with basename"

    return __SPEC_SEP__.join((path, dsname))

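# usage sketch for the spec helpers (hypothetical archive path):
#
#   spec = join("/data/mnist.h5", "train")   # -> "/data/mnist.h5:train"
#   archname(spec)                           # -> "/data/mnist.h5"
#   basename(spec)                           # -> "train"
#   split(spec)                              # -> ("/data/mnist.h5", "train")
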
72 """train subdir inside archive"""
73
74 return "train_%s_%s" % (os.path.splitext(os.path.basename(cfg_file))[0],
75 datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
76
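# e.g. a hypothetical cfg_file "configs/lenet.yaml" yields a name like
# "train_lenet_20240131_094501": config basename plus a run timestamp
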
79 "atomic save operation"
80 log.debug("writing to %s:%s", file_path, key)
81 with filelock.FileLock(file_path) as lock:
82 store = pd.HDFStore( file_path )
83 store[key] = obj
84 store.close()
85
86
87 store = pd.HDFStore( file_path )
88 if key in store:
89 log.debug("object in %s:%s", file_path, key)
90 else:
91 log.error("POSSIBLE DATA LOSS: could save object in %s:%s", file_path, key)
92 store.close()
93
95 "atomic load operation"
96 log.debug("reading from %s:%s", file_path, key)
97 with filelock.FileLock(file_path) as lck:
98 store = pd.HDFStore( file_path )
99 obj = store[key]
100 store.close()
101 return obj
102
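# round-trip sketch (hypothetical archive): save(df, "scratch.h5", "demo/X")
# writes df under the key "demo/X" and load("scratch.h5", "demo/X") reads it
# back; both serialize access through the same "scratch.h5.lock" file
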
105 "a checker for correct specification of an archive path"
106
107 return join( *__parse(path) )
108
112 "load (X,Y) as numpy arrays from the archive"
113
114 log.info("loading data (and targets) %s ...", path)
115
116 store = pd.HDFStore( archname(path) )
117 x = store[basename(path)+"/X"].values
118 y = store[basename(path)+"/T"]['label_code'].values
119 if high_idx > low_idx:
120 x = x[low_idx:high_idx]
121 y = y[low_idx:high_idx]
122 store.close()
123 return (x, y)
124
@deprecated
def get_dataset(path, low_idx, high_idx):
    "load X as a numpy array from the archive"

    log.info("loading data %s ...", path)

    store = pd.HDFStore(archname(path))
    x = store[basename(path) + "/X"].values
    if high_idx > low_idx:
        x = x[low_idx:high_idx]
    store.close()
    return x

@deprecated
def get_batchX(store, exp_name, _idx, batch_size, which):
    "load a batch from the archive"

    low = batch_size * _idx
    high = low + batch_size

    return store[exp_name + "/" + which].values[low:high]
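
# e.g. (hypothetical names) get_batchX(store, "exp1", 2, 32, "X") slices
# rows [64, 96) out of store["exp1/X"]; a final partial batch is silently
# shortened by the slice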