Package dimer :: Module data

Source Code for Module dimer.data

'''
support for dataset IO and manipulation

@author: odenas

'''
import logging

import numpy as np
import pandas as pd
import theano

import archive, ops

log = logging.getLogger(__name__)


class aaHDFArchive(object):
    def __init__(self, path):
        self.archive, self.datapath = archive.split(path)
        if not hasattr(self, "pX") or not hasattr(self, "X"):
            raise ValueError("this mixin requires pX and X attributes")

    def dump(self, path):
        arch = self.archive
        key = self.datapath
        X, Y, T = self.pX, self.sY, self.dfT
        (nsamp, ntrack, width) = X.values.shape

        if X is not None:
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def _from_archive(cls, path, raw, *args):
        ap, did = archive.split(path)

        key = "%s/%s" % (did, (raw and "rawX" or "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)

        return cls(X, load_none("Y"), load_none("T"), *args)

class aAnchorDataset(object):
    """this dataset contains various tracks of epigenetic signal for a
    set of genome sites (e.g., TSS-centered regions), all of the same
    width. an instance maintains references to X (as a panel and ndarray),
    Y, and T
    """

    def __init__(self, X, Y, T, batch_size, valid_s=None, valid_idx=None, rng=None):
        self.pX = X
        self.X = X.values

        self.sY = Y
        if Y is not None:
            self.Y = Y.values

        self.dfT = T
        if T is not None:
            self.T = T["label_code"].values

        ## theano shared vars
        self.__shX = None
        self.__shY = None
        self.__shT = None

        self.batch_size = (batch_size or self.X.shape[0])

        nb = self.X.shape[0] / self.batch_size  ## nr. of batches
        self.train_batches, self.valid_batches = self.__batches(self.X.shape[0],
                                                                self.batch_size,
                                                                (valid_s or 0.25),
                                                                (valid_idx or nb - nb / 4),
                                                                rng)
        self.label_names = None
        if self.dfT is not None:
            self.label_names = np.unique(self.dfT["label_name"].values).tolist()

        self.track_names = self.pX.major_axis.values.tolist()

    def __sh_anon(self, what, shape=None, borrow=True):
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        ## this class is named aAnchorDataset, so private attributes are
        ## mangled to _aAnchorDataset__sh<what>
        if getattr(self, "_aAnchorDataset__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_aAnchorDataset__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_aAnchorDataset__sh%s" % what)

    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the required data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)
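
    ## A minimal usage sketch (hypothetical toy inputs panelX, seriesY, dfT;
    ## kept as a comment, not part of the module). shX lazily wraps self.X
    ## once, while share() builds a fresh shared variable on every call,
    ## optionally reshaped (e.g., flattened for a dense layer):
    ##
    ##   ds = aAnchorDataset(panelX, seriesY, dfT, batch_size=20)
    ##   x_sh = ds.shX                                      # cached theano.shared of X
    ##   x_flat = ds.share("X", shape=(ds.X.shape[0], -1))  # new, flattened copy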

    def __iter_batches(self, which, nepochs):
        """loop over train/valid batches

        @param nepochs: loop this many times over the batches (0 loops forever)
        @return: iterator"""

        assert which in ("train_batches", "valid_batches")

        batches = getattr(self, which)
        epoch = 0
        while True:
            for i in batches:
                yield i
            epoch = epoch + 1
            if epoch == nepochs:
                break

    def iter_train(self, nepochs):
        return self.__iter_batches("train_batches", nepochs)

    def iter_valid(self, nepochs):
        return self.__iter_batches("valid_batches", nepochs)

    @staticmethod
    def __batches(tot_size, batch_s, valid_s, valid_idx, rng):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_size = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train + 1 valid
        batches: T T T V T

        @param tot_size : nr. of examples
        @param batch_s  : batch size
        @param valid_s  : fraction of data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng      : numpy.random.RandomState used to shuffle batches or None (no shuffle)
        @return         : (train_batches, valid_batches)"""

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch size (%d) too big: > min(valid_s=%d, train_s=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = range(tot_size / batch_s)
        ## slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0
        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)
        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))
        return (train_batches, valid_batches)
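
    ## Worked example of the split described above (plain arithmetic, kept
    ## as a comment so the listing stays importable). With tot_size=10,
    ## batch_s=2 there are 5 batches; valid_s=0.3 gives int(5 * 0.3) = 1
    ## validation batch, placed at valid_idx=3:
    ##
    ##   all_batches   = range(10 / 2)        # [0, 1, 2, 3, 4]
    ##   valid_batches = all_batches[3:3 + 1] # [3]           -> T T T V T
    ##   train_batches = [0, 1, 2, 4]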

    @property
    def labels(self):
        return len(self.label_names)

    @property
    def tracks(self):
        return len(self.track_names)

    @property
    def width(self):
        return self.X.shape[2]

    @staticmethod
    def normalize_features(x):
        """transform each component of X so that it has 0 mean and 1 std.
        So the values of track t at position i (across anchors) have 0 mean
        and 1 std

        @param x: a pandas data panel of the form <anchors> X <tracks> X <genome position>
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component)
                 the latter 2 are arrays of shape (<tracks>, <genome position>)
        """

        normX, m, v = ops.standardize(x.values, axis=0)
        pX = pd.Panel(normX, items=x.items, major_axis=x.major_axis,
                      minor_axis=x.minor_axis)
        return pX, m.reshape(x.shape[1], -1), v.reshape(x.shape[1], -1)

    @staticmethod
    def fit_features(x):
        """transform each **component** of X so that it fits in the interval [-1, 1].
        So the values of track t at position i are all in [-1, 1]

        @param x: a pandas data panel of the form <anchors> X <tracks> X <genome position>
        @return: the scaled input
        """

        fitX = ops.fit(x.values.reshape(x.shape[0], -1), axis=0)

        return pd.Panel(fitX.reshape(x.shape),
                        items=x.items, major_axis=x.major_axis,
                        minor_axis=x.minor_axis)
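
    ## A small shape sketch (toy data; assumes ops.standardize / ops.fit
    ## behave as used above, returning per-component statistics along axis 0):
    ##
    ##   x = pd.Panel(np.random.randn(6, 2, 4))  # 6 anchors, 2 tracks, width 4
    ##   pX, m, sd = aAnchorDataset.normalize_features(x)  # m.shape == sd.shape == (2, 4)
    ##   fX = aAnchorDataset.fit_features(x)      # same shape as x, values in [-1, 1]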


#### TODO ####
class Dataset(object):
    def __init__(self, X, Y, T):
        self.X, self.Y, self.T = X, Y, T

        if (Y is not None) and X.shape[0] != Y.shape[0]:
            raise ValueError("|X| (%d) != |Y| (%d)" % (X.shape[0],
                                                       Y.shape[0]))
        if (T is not None) and X.shape[0] != T.shape[0]:
            raise ValueError("|X| (%d) != |T| (%d)" % (X.shape[0],
                                                       T.shape[0]))
        log.info("allocated dataset. X of shape %s, Y %s, T %s",
                 str(self.X.shape),
                 (self.Y is None and "missing" or str(self.Y.shape)),
                 (self.T is None and "missing" or str(self.T.shape)))

    @property
    def is_labeled(self):
        return self.T is not None

    @property
    def labels(self):
        if self.is_labeled:
            return np.unique(self.T).shape[0]
        else:
            raise AttributeError("unlabeled dataset")

    @staticmethod
    def normalize_features(x):
        """transform each component of flattened X examples to 0 mean and 1 std.
        So the values of feature f (from all examples) have 0 mean and 1 std

        @param x: a ndarray of shape (nr. examples, nr. features)
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component)
                 the latter 2 are arrays of shape (nr. features,)
        """

        return ops.standardize(x, axis=0)

    @staticmethod
    def fit_features(x):
        """transform each **component** of X so that it fits in the interval [-1, 1].
        So the values of feature f (from all examples) are all in [-1, 1]

        @param x: a ndarray of shape (nr. examples, nr. features)
        @return: the fitted input
        """

        return ops.fit(x, axis=0)
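
    ## A minimal usage sketch (toy ndarrays, not part of the module):
    ##
    ##   X = np.random.randn(100, 8)            # 100 examples, 8 features
    ##   T = np.random.randint(0, 3, size=100)  # label codes 0, 1, 2
    ##   ds = Dataset(X, None, T)
    ##   ds.is_labeled                          # True
    ##   ds.labels                              # nr. of distinct labels (here 3)
    ##   normX, m, sd = Dataset.normalize_features(X)  # m.shape == (8,)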

class TheanoShare(object):
    """a dataset that can return its data as theano shared variables"""

    def __init__(self):
        self.__shX, self.__shY, self.__shT = None, None, None

    def __sh_anon(self, what, shape=None, borrow=True):
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        if getattr(self, "_TheanoShare__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_TheanoShare__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_TheanoShare__sh%s" % what)

    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the required data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)

class TrainDataset(object):
    """a mixin providing batch functionality and train/validation sub-datasets"""

    def __init__(self, batch_s, tot_s=None, valid_s=None, valid_idx=None, rng=None):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_s = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train + 1 valid
        batches: T T T V T

        @param batch_s  : batch size
        @param tot_s    : nr. of examples (defaults to the dataset size)
        @param valid_s  : fraction of data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng      : numpy.random.RandomState used to shuffle batches or None (no shuffle)"""

        if tot_s is None:
            tot_s = self.X.shape[0]
        if self.X.shape[0] < tot_s:
            log.warning("total size (%d) > dataset size (%d). adjusting ...",
                        tot_s, self.X.shape[0])
        self.total_size = min(tot_s, self.X.shape[0])

        self.batch_size = batch_s

        if valid_s is None:
            valid_s = 0.25
        self.valid_size = valid_s

        self.n_batches = self.X.shape[0] / self.batch_size  ## nr. of batches
        if valid_idx is None:
            valid_idx = self.n_batches - int(self.n_batches * self.valid_size)
        self.valid_idx = valid_idx

        self.rng = rng

        self.train_batches, self.valid_batches = self.__batches()

    def __batches(self):
        tot_size = self.total_size
        batch_s = self.batch_size
        valid_s = self.valid_size
        valid_idx = self.valid_idx
        rng = self.rng

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch_s (%d) > min(valid_s=%d, train_s=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = range(tot_size / batch_s)
        ## slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0

        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)

        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))

        return (train_batches, valid_batches)

    def __iter_batches(self, which, nepochs):
        """loop over train/valid batches

        @param nepochs: loop this many times over the batches (0 loops forever)
        @return: iterator"""

        assert which in ("train_batches", "valid_batches")

        batches = getattr(self, which)
        epoch = 0
        while True:
            for i in batches:
                yield i
            epoch = epoch + 1
            if epoch == nepochs:
                break

    def iter_train(self, nepochs):
        return self.__iter_batches("train_batches", nepochs)

    def iter_valid(self, nepochs):
        return self.__iter_batches("valid_batches", nepochs)
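
    ## A minimal training-loop sketch (assumes a mixing class that defines
    ## self.X, e.g. TrainAnchorDataset below; `train_model` and `validate`
    ## are hypothetical callables):
    ##
    ##   for bi in ds.iter_train(nepochs=10):
    ##       train_model(bi * ds.batch_size, (bi + 1) * ds.batch_size)
    ##   scores = [validate(bi) for bi in ds.iter_valid(1)]  # each valid batch once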

class AnchorAnnotation(object):
    def __init__(self, anchors, tracks, width, labels):
        self.pX = pd.Panel(self.X, items=anchors, major_axis=tracks,
                           minor_axis=width)
        if self.Y is not None:
            self.sY = pd.Series(self.Y, index=anchors)

        self.label_names = None
        if self.T is not None:
            coden = map(lambda v: labels[v], self.T)
            self.dfT = pd.DataFrame({"label_code": self.T,
                                     "label_name": coden})
            self.label_names = labels

class AnchorDataset(Dataset, AnchorAnnotation, TheanoShare):
    def __init__(self, X, Y, T):
        """X, Y, T are a Panel, Series, and DataFrame resp."""

        valY = None
        if Y is not None:
            valY = Y.values
        valT = None
        if T is not None:
            valT = T["label_code"].values

        Dataset.__init__(self, X.values, valY, valT)
        TheanoShare.__init__(self)
        self.pX, self.sY, self.dfT = X, Y, T

        self.label_names = None
        if self.T is not None:
            self.label_names = np.unique(self.dfT["label_name"].values).tolist()

    @property
    def track_names(self):
        return self.pX.major_axis.tolist()

    @property
    def tracks(self):
        return self.X.shape[1]

    @property
    def width(self):
        return self.X.shape[2]

    @property
    def labels(self):
        return len(self.label_names)

    def dump(self, path):
        arch, key = archive.split(path)

        X, Y, T = self.pX, self.sY, self.dfT
        (nsamp, ntrack, width) = X.values.shape

        if X is not None:
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def mean_sdX(cls, path):
        ap, did = archive.split(path)

        meanX = archive.load_object(ap, "/".join((did, "meanX")))
        sdX = archive.load_object(ap, "/".join((did, "sdX")))
        return meanX, sdX

    @classmethod
    def _from_archive(cls, path, raw, *args):
        ap, did = archive.split(path)

        key = "%s/%s" % (did, (raw and "rawX" or "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                pass
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, *args)
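
## A round-trip sketch for AnchorDataset (hypothetical `path`; archive.split
## is assumed to break it into an archive file and a dataset key, as above):
##
##   ds.dump(path)                              # stores rawX, X, meanX, sdX, Y, T
##   ds2 = AnchorDataset._from_archive(path, raw=False)  # loads the normalized X
##   meanX, sdX = AnchorDataset.mean_sdX(path)  # per-track/position statistics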

class TrainAnchorDataset(AnchorDataset, TrainDataset):
    def __init__(self, X, Y, T, batch_s,
                 tot_s=None, valid_s=None, valid_idx=None, rng=None):

        AnchorDataset.__init__(self, X, Y, T)
        TrainDataset.__init__(self, batch_s, tot_s, valid_s, valid_idx, rng)

    @classmethod
    def _from_archive(cls, path, raw, batch_s, **kwargs):
        ap, did = archive.split(path)

        key = "%s/%s" % (did, (raw and "rawX" or "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                pass
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, batch_s, **kwargs)
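
## An end-to-end sketch (toy data, kept as a comment; all names below are
## hypothetical):
##
##   X = pd.Panel(np.random.randn(100, 2, 4))   # anchors x tracks x positions
##   codes = np.random.randint(0, 2, 100)
##   T = pd.DataFrame({"label_code": codes,
##                     "label_name": np.where(codes == 0, "a", "b")})
##   ds = TrainAnchorDataset(X, None, T, batch_s=10, valid_s=0.2,
##                           rng=np.random.RandomState(42))
##   ds.tracks, ds.width, ds.labels              # (2, 4, 2)
##   list(ds.iter_valid(1))                      # the validation batch indices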