1 """
2 utilities and wrappers for working with HDF archives.
3
4 * an archive contains one of more datasets
5
6 * a dataset contains
7
8 - an array X: the data
9 - an optional array Y: the raw response
10 - an optional array T: labels (targets) associated with the response
11
12 a dataset specification (dset_path) is the concatenation
13 (`:` is used as a separator)
14 of the path to the HDF file and the dataset name. All datasets hang on the root.
15 """

import datetime
import logging
import os
import re

import pandas as pd

import filelock
from . import deprecated

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

__HDF_SUFFIX__ = "h5"
__SPEC_SEP__ = ":"

DSPEC_MSG = ("dataset specification; of the form "
             "<hdf_archive_path>%s<dataset_name>" % __SPEC_SEP__)

# pattern reconstructed from its use in __parse(): the first group is greedy,
# so the dataset name is whatever follows the last separator
PATT = re.compile("(.+)%s(.+)" % __SPEC_SEP__)


def __parse(path):
    try:
        return PATT.match(path).groups()
    except AttributeError:
        raise AttributeError("cannot parse (pattern %s) path (%s)"
                             % (PATT.pattern, path))

43 """a dataset has the form <archname>:<basename>
44
45 @return: the archive (inside the file) part of the path
46 """
47
48 return __parse(path)[1]
49
51 """a dataset has the form <archname>:<basename>
52
53 @return: the archname part of the path
54 """
55 return __parse(path)[0]
56
58 """a dataset has the form <archname>:<basename>
59
60 @return: (archname, basename)
61 """
62 return __parse(path)
63
64
def join(path, dsname):
    "join archname with basename"

    return __SPEC_SEP__.join((path, dsname))

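# usage sketch for the spec helpers (hypothetical archive path):
#
#   spec = join("/data/mnist.h5", "train")   # -> "/data/mnist.h5:train"
#   archname(spec)                           # -> "/data/mnist.h5"
#   basename(spec)                           # -> "train"
#   split(spec)                              # -> ("/data/mnist.h5", "train")
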
72 """train subdir inside archive"""
73
74 return "train_%s_%s" % (os.path.splitext(os.path.basename(cfg_file))[0],
75 datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
76
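# e.g. a hypothetical cfg_file "configs/lenet.yaml" yields a name like
# "train_lenet_20240131_094501": config basename plus a run timestamp
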
79 "atomic save operation"
80 log.debug("writing to %s:%s", file_path, key)
81 with filelock.FileLock(file_path) as lock:
82 store = pd.HDFStore( file_path )
83 store[key] = obj
84 store.close()
85
86
87 store = pd.HDFStore( file_path )
88 if key in store:
89 log.debug("object in %s:%s", file_path, key)
90 else:
91 log.error("POSSIBLE DATA LOSS: could save object in %s:%s", file_path, key)
92 store.close()
93
95 "atomic load operation"
96 log.debug("reading from %s:%s", file_path, key)
97 with filelock.FileLock(file_path) as lck:
98 store = pd.HDFStore( file_path )
99 obj = store[key]
100 store.close()
101 return obj
102
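# round-trip sketch (hypothetical archive): save(df, "scratch.h5", "demo/X")
# writes df under the key "demo/X" and load("scratch.h5", "demo/X") reads it
# back; both serialize access through the same "scratch.h5.lock" file
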
105 "a checker for correct specification of an archive path"
106
107 return join( *__parse(path) )
108
112 "load (X,Y) as numpy arrays from the archive"
113
114 log.info("loading data (and targets) %s ...", path)
115
116 store = pd.HDFStore( archname(path) )
117 x = store[basename(path)+"/X"].values
118 y = store[basename(path)+"/T"]['label_code'].values
119 if high_idx > low_idx:
120 x = x[low_idx:high_idx]
121 y = y[low_idx:high_idx]
122 store.close()
123 return (x, y)
124
@deprecated
def get_dataset(path, low_idx, high_idx):
    "load X as a numpy array from the archive"

    log.info("loading data %s ...", path)

    store = pd.HDFStore(archname(path))
    x = store[basename(path) + "/X"].values
    if high_idx > low_idx:
        x = x[low_idx:high_idx]
    store.close()
    return x

@deprecated
def get_batchX(store, exp_name, _idx, batch_size, which):
    "load a batch from the archive"

    low = batch_size * _idx
    high = low + batch_size

    return store[exp_name + "/" + which].values[low:high]
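
# e.g. (hypothetical names) get_batchX(store, "exp1", 2, 32, "X") slices
# rows [64, 96) out of store["exp1/X"]; a final partial batch is silently
# shortened by the slice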