'''
support for dataset IO and manipulation

@author: odenas
'''
import logging

import numpy as np
import pandas as pd
import theano

import archive
import ops

log = logging.getLogger(__name__)

class ArchiveMixin(object):  # [reconstructed] original class declaration lost
    """adds archive-backed dump/load to datasets exposing pX, sY, dfT"""

    def __init__(self, path):
        self.archive, self.datapath = archive.split(path)
        if not hasattr(self, "pX") or not hasattr(self, "X"):
            raise ValueError("this mixin requires pX and X attributes")

    def dump(self, path=None):
        """save this dataset (raw X, standardized X with its per-component
        mean/sd, Y, T) under the archive location given at construction
        time, or under `path` if given"""

        if path is None:
            arch, key = self.archive, self.datapath
        else:
            arch, key = archive.split(path)
        X, Y, T = self.pX, self.sY, self.dfT

        if X is not None:
            (nsamp, ntrack, width) = X.values.shape
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            # mean/sd come back flat (ntrack * width,); reshape to match the
            # (track, position) frame layout
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def load(cls, path, raw=False, *args):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)

        return cls(X, load_none("Y"), load_none("T"), *args)

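# A minimal round-trip sketch for the mixin above, assuming a concrete
# subclass (here called SomeDataset, hypothetical) that provides pX/sY/dfT,
# and an archive path of the form understood by archive.split():
#
#   ds = SomeDataset.load("data.h5/tss", raw=False)
#   ds.dump()   # re-writes rawX, X, meanX, sdX (and Y/T when present)

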
68 """this dataset contains various tracks of epigenetic signal in the
69 for a set of genome sites (e.g., TSS-cenetered regions) all of the
70 same width. an instance maintains references to X (as a panel and ndarray),
71 Y, and T
72 """
73
    def __init__(self, X, Y, T, batch_size, valid_s=None, valid_idx=None, rng=None):
        """@param X: pd.Panel of signal (<anchors> x <tracks> x <positions>)
        @param Y: pd.Series of targets indexed by anchor, or None
        @param T: pd.DataFrame with 'label_code'/'label_name' columns, or None
        @param batch_size: examples per batch (None/0 means one batch with all data)
        @param valid_s, valid_idx, rng: see __batches"""

        self.pX = X
        self.X = X.values

        self.sY = Y
        self.Y = (Y.values if Y is not None else None)

        self.dfT = T
        self.T = (T["label_code"].values if T is not None else None)

        self.__shX = None
        self.__shY = None
        self.__shT = None

        self.batch_size = (batch_size or self.X.shape[0])

        nb = self.X.shape[0] // self.batch_size
        self.train_batches, self.valid_batches = self.__batches(self.X.shape[0],
                                                                self.batch_size,
                                                                (valid_s or 0.25),
                                                                (valid_idx or nb - nb // 4),
                                                                rng)
        self.label_names = None
        if self.dfT is not None:
            self.label_names = np.unique(self.dfT["label_name"].values).tolist()

        self.track_names = self.pX.major_axis.values.tolist()

    def __sh_anon(self, what, shape=None, borrow=True):
        # lazily build and cache a theano shared variable for member `what`
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        if getattr(self, "_AnchorDataset__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_AnchorDataset__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_AnchorDataset__sh%s" % what)

    # [reconstructed] the def lines of these properties were lost; names
    # follow the __sh<member> convention used in __sh_anon
    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the requested data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)

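    # Example (sketch): wrapping the design matrix for a theano graph; the
    # flattening shape is illustrative, any shape compatible with X.size works.
    #
    #   ds = AnchorDataset(pX, sY, dfT, 20)
    #   shared_X = ds.share("X", shape=(ds.X.shape[0], -1), borrow=True)
    #   # shared_X.get_value(borrow=True).shape == (nsamp, ntrack * width)
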
146 """infinite loop over train/valid batches
147
148 @param nepochs: loop this many times over train batches (0 will loop forever)
149 @return: iterator """
150
151 assert which in ("train_batches", "valid_batches")
152
153 batches = getattr(self, which)
154 epoch = 0
155 while True:
156 for i in batches:
157 yield i
158 epoch = epoch + 1
159 if epoch == nepochs:
160 break
161
164
167
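    # Example (sketch): driving a training loop with the iterator above
    # (train_fn is hypothetical):
    #
    #   for bidx in ds.batch_iterator("train_batches", nepochs=2):
    #       lo = bidx * ds.batch_size
    #       train_fn(ds.X[lo:lo + ds.batch_size])
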
    @staticmethod
    def __batches(tot_size, batch_s, valid_s, valid_idx, rng):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_size = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train batches
        and 1 validation batch: T T T V T

        @param tot_size: nr. of examples
        @param batch_s: batch size
        @param valid_s: fraction of the data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng: numpy.random.RandomState used to shuffle batches or None (no shuffle)
        @return: (train_batches, valid_batches)"""

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch size (%d) > min(valid size=%d, train size=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = list(range(tot_size // batch_s))
        # slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0
        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)
        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))
        return (train_batches, valid_batches)

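    # Worked example of the split above: tot_size=10, batch_s=2, valid_s=0.3,
    # valid_idx=3 gives all_batches = [0, 1, 2, 3, 4]; the validation slice is
    # all_batches[3:3 + int(5 * 0.3)] = [3], so train = [0, 1, 2, 4] and
    # valid = [3], i.e. the T T T V T layout from the docstring.
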
    # [reconstructed] the def lines of these properties were lost; names are
    # inferred from the (samples, tracks, width) layout of X
    @property
    def n_labels(self):
        return len(self.label_names)

    @property
    def n_tracks(self):
        return self.X.shape[1]

    @property
    def width(self):
        return self.X.shape[2]

    @staticmethod
    def normalize_features(x):  # [reconstructed] def line lost
        """transform each component of the flattened X examples to 0 mean and
        1 std, so that the values of track t at position i are 0 mean and 1
        std across anchors

        @param x: a pandas Panel of the form <anchors> x <tracks> x <genome position>
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component);
                 the latter 2 are arrays of shape (<tracks>, <genome position>)
        """

        normX, m, v = ops.standardize(x.values, axis=0)
        pX = pd.Panel(normX, items=x.items, major_axis=x.major_axis,
                      minor_axis=x.minor_axis)
        return pX, m.reshape(x.shape[1], -1), v.reshape(x.shape[1], -1)

    @staticmethod
    def fit_features(x):  # [reconstructed] def line lost
        """transform each **component** of X so that it fits in the interval
        [-1, 1], i.e. the values of track t at position i are all in [-1, 1]

        @param x: a pandas Panel of the form <anchors> x <tracks> x <genome position>
        @return: the scaled input
        """

        fitX = ops.fit(x.values.reshape(x.shape[0], -1), axis=0)

        return pd.Panel(fitX.reshape(x.shape),
                        items=x.items, major_axis=x.major_axis,
                        minor_axis=x.minor_axis)

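# Example (sketch) of the two transforms above on a small random panel
# (fit_features is the reconstructed name used in this file; ops.standardize
# and ops.fit are this package's own helpers):
#
#   raw = pd.Panel(np.random.randn(100, 3, 5) * 7.0 + 2.0)
#   normed, mean, sd = AnchorDataset.normalize_features(raw)
#   # normed.values[:, t, i] has ~0 mean and ~1 std for every (t, i)
#   scaled = AnchorDataset.fit_features(raw)   # values fall in [-1, 1]

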
class Dataset(object):
    """a plain dataset: X holds the examples; Y (targets) and T (labels) are
    optional"""

    def __init__(self, X, Y, T):
        self.X, self.Y, self.T = X, Y, T

        if (Y is not None) and X.shape[0] != Y.shape[0]:
            raise ValueError("|X| (%d) != |Y| (%d)" % (X.shape[0], Y.shape[0]))
        if (T is not None) and X.shape[0] != T.shape[0]:
            raise ValueError("|X| (%d) != |T| (%d)" % (X.shape[0], T.shape[0]))
        log.info("allocated dataset. X of shape %s, Y %s, T %s",
                 str(self.X.shape),
                 ("missing" if self.Y is None else str(self.Y.shape)),
                 ("missing" if self.T is None else str(self.T.shape)))

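    # Example (sketch) of the shape checks at construction time:
    #
    #   X, T = np.zeros((100, 15)), np.repeat([0, 1], 50)
    #   ds = Dataset(X, None, T)          # fine: |X| == |T|
    #   Dataset(X, np.zeros(99), None)    # ValueError: |X| (100) != |Y| (99)
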
    @property
    def is_labeled(self):
        return self.T is not None

    @property
    def n_labels(self):  # [reconstructed] def line lost
        if self.is_labeled:
            return np.unique(self.T).shape[0]
        else:
            raise AttributeError("unlabeled dataset")

    @staticmethod
    def normalize_features(x):  # def line lost; name confirmed by callers
        """transform each component of the flattened X examples to 0 mean and
        1 std, so that the values of feature f (across all examples) are 0
        mean and 1 std

        @param x: an ndarray of shape (nr. examples, nr. features)
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component);
                 the latter 2 are arrays of shape (nr. features,)
        """

        return ops.standardize(x, axis=0)

    @staticmethod
    def fit_features(x):  # [reconstructed] def line lost
        """transform each **component** of X so that it fits in the interval
        [-1, 1], i.e. the values of feature f are all in [-1, 1]

        @param x: an ndarray of shape (nr. examples, nr. features)
        @return: the fitted input
        """

        return ops.fit(x, axis=0)

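# The semantics expected from the ops helpers used above (a sketch; the
# actual signatures live in ops.py):
#
#   x = np.random.randn(50, 4) * 3.0 + 1.0
#   normX, m, sd = ops.standardize(x, axis=0)   # per-column zero mean, unit std
#   fitX = ops.fit(x, axis=0)                   # per-column rescaling into [-1, 1]

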
305 """a dataset that can return its data as theano shared variables"""
306
308 self.__shX, self.__shY, self.__shT = None, None, None
309
    def __sh_anon(self, what, shape=None, borrow=True):
        # lazily build and cache a theano shared variable for member `what`
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        if getattr(self, "_TheanoShare__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_TheanoShare__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_TheanoShare__sh%s" % what)

    # [reconstructed] the def lines of these properties were lost; names
    # follow the __sh<member> convention used in __sh_anon
    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the requested data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)

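# Example (sketch): shX/shY/shT cache one shared variable per member, while
# share() builds a fresh (optionally reshaped) one on every call. For any
# TheanoShare dataset ds:
#
#   ds.shX is ds.shX                       # True: cached on first access
#   ds.share("X") is ds.share("X")         # False: two distinct variables

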
349 """a mixin for batch functionality, valid and train sub-dataset"""
350
    def __init__(self, batch_s, tot_s=None, valid_s=None, valid_idx=None, rng=None):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_s = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train batches
        and 1 validation batch: T T T V T

        @param batch_s: batch size
        @param tot_s: nr. of examples (defaults to, and is capped at, |X|)
        @param valid_s: fraction of the data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng: numpy.random.RandomState used to shuffle batches or None (no shuffle)

        sets the train_batches and valid_batches attributes"""

        if tot_s is None:
            tot_s = self.X.shape[0]
        if self.X.shape[0] < tot_s:
            log.warning("total size (%d) > dataset size (%d). adjusting ...",
                        tot_s, self.X.shape[0])
        self.total_size = min(tot_s, self.X.shape[0])

        self.batch_size = batch_s

        if valid_s is None:
            valid_s = 0.25
        self.valid_size = valid_s

        self.n_batches = self.X.shape[0] // self.batch_size
        if valid_idx is None:
            valid_idx = self.n_batches - int(self.n_batches * self.valid_size)
        self.valid_idx = valid_idx

        self.rng = rng

        self.train_batches, self.valid_batches = self.__batches()

    def __batches(self):
        tot_size = self.total_size
        batch_s = self.batch_size
        valid_s = self.valid_size
        valid_idx = self.valid_idx
        rng = self.rng

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch_s (%d) > min(valid size=%d, train size=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = list(range(tot_size // batch_s))
        # slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0

        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)

        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))

        return (train_batches, valid_batches)

424 """infinite loop over train/valid batches
425
426 @param nepochs: loop this many times over train batches (0 will loop forever)
427 @return: iterator """
428
429 assert which in ("train_batches", "valid_batches")
430
431 batches = getattr(self, which)
432 epoch = 0
433 while True:
434 for i in batches:
435 yield i
436 epoch = epoch + 1
437 if epoch == nepochs:
438 break
439
442
445
class PandasViewMixin(object):  # [reconstructed] original class name lost
    """builds pandas views (pX, sY, dfT) over the raw X, Y, T ndarrays"""

    def __init__(self, anchors, tracks, width, labels):
        self.pX = pd.Panel(self.X, items=anchors, major_axis=tracks,
                           minor_axis=width)
        if self.Y is not None:
            self.sY = pd.Series(self.Y, index=anchors)

        self.label_names = None
        if self.T is not None:
            coden = [labels[v] for v in self.T]
            self.dfT = pd.DataFrame({"label_code": self.T,
                                     "label_name": coden})
            self.label_names = labels

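# Example (sketch) of the label bookkeeping above: with labels = ["neg", "pos"]
# and T = [0, 1, 0], dfT comes out as
#
#      label_code label_name
#   0           0        neg
#   1           1        pos
#   2           0        neg

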
463 """X, Y, T are a Panel, Series, and DataFrame resp."""
464
465 valY = None
466 if not (Y is None):
467 valY = Y.values
468 valT = None
469 if not (T is None):
470 valT = T["label_code"].values
471
472 Dataset.__init__(self, X.values, valY, valT)
473 TheanoShare.__init__(self)
474 self.pX, self.sY, self.dfT = X, Y, T
475
476 self.label_names = None
477 if not (self.T is None):
478 self.label_names = np.unique( self.dfT["label_name"].values ).tolist()
479
    # [reconstructed] the def lines of these properties were lost; the names
    # follow the attributes used elsewhere in this module
    @property
    def track_names(self):
        return self.pX.major_axis.tolist()

    @property
    def n_tracks(self):
        return self.X.shape[1]

    @property
    def width(self):
        return self.X.shape[2]

    @property
    def n_labels(self):
        return len(self.label_names)

    def dump(self, path):
        arch, key = archive.split(path)

        X, Y, T = self.pX, self.sY, self.dfT

        if X is not None:
            (nsamp, ntrack, width) = X.values.shape
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def load(cls, path, raw=False, *args):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, *args)

class BatchPanelDataset(PanelDataset, BatchesMixin):  # [reconstructed] original class name lost
    def __init__(self, X, Y, T, batch_s,
                 tot_s=None, valid_s=None, valid_idx=None, rng=None):
        # [reconstructed] body lost; initialize the panel dataset first, then
        # the batch bookkeeping (which reads self.X)
        PanelDataset.__init__(self, X, Y, T)
        BatchesMixin.__init__(self, batch_s, tot_s=tot_s, valid_s=valid_s,
                              valid_idx=valid_idx, rng=rng)

    @classmethod
    def load(cls, path, batch_s, raw=False, **kwargs):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, batch_s, **kwargs)
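

if __name__ == "__main__":
    # smoke test on synthetic data: a sketch of intended use, not part of the
    # original module (requires only numpy and the package's ops module)
    logging.basicConfig(level=logging.INFO)

    nsamp, nfeat = 100, 20
    X = np.random.randn(nsamp, nfeat)
    T = np.array([0, 1] * (nsamp // 2))

    ds = Dataset(X, None, T)
    print("labeled: %s, n_labels: %d" % (ds.is_labeled, ds.n_labels))

    normX, m, sd = Dataset.normalize_features(X)
    print("per-feature mean ~ 0: %s" % np.allclose(normX.mean(axis=0), 0.0))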