Package dimer :: Module archive

Source Code for Module dimer.archive

  1  """ 
  2  utilities and wrappers for working with HDF archives. 
  3   
  4  * an archive contains one of more datasets 
  5   
  6  * a dataset contains 
  7   
  8    - an array X: the data 
  9    - an optional array Y: the raw response 
 10    - an optional array T: labels (targets) associated with the response 
 11   
 12  a dataset specification (dset_path) is the concatenation 
 13  (`:` is used as a separator) 
 14  of the path to the HDF file and the dataset name. All datasets hang on the root. 
 15  """ 

import logging, datetime, os, re

import pandas as pd

import filelock
from . import deprecated

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

__HDF_SUFFIX__ = "h5"
__SPEC_SEP__ = ":"

DSPEC_MSG = ("dataset specification; of the form "
             "<hdf_archive_path>%s<dataset_name>" % __SPEC_SEP__)


def __parse(path, PATT=re.compile("(.+[.]%s)[%s](.+)" % (__HDF_SUFFIX__, __SPEC_SEP__))):
    try:
        return PATT.match( path ).groups()
    except AttributeError:
        raise AttributeError("cannot parse (pattern %s) path (%s)" % (PATT.pattern, path))

def basename(path):
    """a dataset has the form <archname>:<basename>

    @return: the basename (the dataset name inside the file) part of the path
    """

    return __parse(path)[1]

def archname(path):
    """a dataset has the form <archname>:<basename>

    @return: the archname part of the path
    """
    return __parse(path)[0]

def split(path):
    """a dataset has the form <archname>:<basename>

    @return: (archname, basename)
    """
    return __parse(path)


def join(path, dsname):
    "join archname with basename"

    return __SPEC_SEP__.join( (path, dsname) )

@deprecated
def this_train_name(cfg_file):
    """train subdir inside archive"""

    return "train_%s_%s" % (os.path.splitext(os.path.basename(cfg_file))[0],
                            datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))


def save_object( file_path, key, obj ):
    "atomic save operation"
    log.debug("writing to %s:%s", file_path, key)
    with filelock.FileLock(file_path) as lock:
        store = pd.HDFStore( file_path )
        store[key] = obj
        store.close()

        ## check that it is there
        store = pd.HDFStore( file_path )
        if key in store:
            log.debug("object in %s:%s", file_path, key)
        else:
            log.error("POSSIBLE DATA LOSS: could not save object in %s:%s", file_path, key)
        store.close()

def load_object( file_path, key ):
    "atomic load operation"
    log.debug("reading from %s:%s", file_path, key)
    with filelock.FileLock(file_path) as lck:
        store = pd.HDFStore( file_path )
        obj = store[key]
        store.close()
    return obj


def dset_path(path):
    "a checker for correct specification of an archive path"

    return join( *__parse(path) )


@deprecated
def get_target_dataset(path, low_idx, high_idx):
    "load (X, T) as numpy arrays from the archive"

    log.info("loading data (and targets) %s ...", path)

    store = pd.HDFStore( archname(path) )
    x = store[basename(path)+"/X"].values
    y = store[basename(path)+"/T"]['label_code'].values
    if high_idx > low_idx:
        x = x[low_idx:high_idx]
        y = y[low_idx:high_idx]
    store.close()
    return (x, y)

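# Note on layout (inferred from the accessors above): a dataset <name>
# keeps its arrays under the HDF keys <name>/X, <name>/Y and <name>/T,
# with the T frame carrying the coded labels in a 'label_code' column.
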
@deprecated
def get_dataset(path, low_idx, high_idx):
    "load X as a numpy array from the archive"

    log.info("loading data %s ...", path)

    store = pd.HDFStore( archname(path) )
    x = store[basename(path)+"/X"].values
    if high_idx > low_idx:
        x = x[low_idx:high_idx]
    store.close()
    return x

@deprecated
def get_batchX(store, exp_name, _idx, batch_size, which):
    "load a batch from the archive"

    low = batch_size * _idx
    high = low + batch_size

    return store[exp_name+"/"+which].values[low : high]
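
For orientation, a minimal round-trip sketch against the API above; the
archive path "demo.h5" and dataset name "run_01" are hypothetical names
chosen for illustration, and pd.HDFStore requires the PyTables backend:

    import pandas as pd
    from dimer import archive

    spec = archive.join("demo.h5", "run_01")        # -> "demo.h5:run_01"
    arch, dset = archive.split(spec)                # -> ("demo.h5", "run_01")

    frame = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    archive.save_object(arch, dset + "/X", frame)   # locked write, then read-back check
    restored = archive.load_object(arch, dset + "/X")
    assert restored.equals(frame)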