'''
support for dataset IO and manipulation

@author: odenas
'''
import logging

import numpy as np
import pandas as pd
import theano

import archive
import ops

log = logging.getLogger(__name__)

class ArchiveMixin(object):  # [reconstructed] original class declaration lost
    """adds archive-backed dump/load to datasets exposing pX, sY, dfT"""

    def __init__(self, path):
        self.archive, self.datapath = archive.split(path)
        if not hasattr(self, "pX") or not hasattr(self, "X"):
            raise ValueError("this mixin requires pX and X attributes")

    def dump(self, path=None):
        """save this dataset (raw X, standardized X with its per-component
        mean/sd, Y, T) under the archive location given at construction
        time, or under `path` if given"""

        if path is None:
            arch, key = self.archive, self.datapath
        else:
            arch, key = archive.split(path)
        X, Y, T = self.pX, self.sY, self.dfT

        if X is not None:
            (nsamp, ntrack, width) = X.values.shape
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            # mean/sd come back flat (ntrack * width,); reshape to match the
            # (track, position) frame layout
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def load(cls, path, raw=False, *args):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)

        return cls(X, load_none("Y"), load_none("T"), *args)

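# A minimal round-trip sketch for the mixin above, assuming a concrete
# subclass (here called SomeDataset, hypothetical) that provides pX/sY/dfT,
# and an archive path of the form understood by archive.split():
#
#   ds = SomeDataset.load("data.h5/tss", raw=False)
#   ds.dump()   # re-writes rawX, X, meanX, sdX (and Y/T when present)

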
68 """this dataset contains various tracks of epigenetic signal in the
69 for a set of genome sites (e.g., TSS-cenetered regions) all of the
70 same width. an instance maintains references to X (as a panel and ndarray),
71 Y, and T
72 """
73
    def __init__(self, X, Y, T, batch_size, valid_s=None, valid_idx=None, rng=None):
        """@param X: pd.Panel of signal (<anchors> x <tracks> x <positions>)
        @param Y: pd.Series of targets indexed by anchor, or None
        @param T: pd.DataFrame with 'label_code'/'label_name' columns, or None
        @param batch_size: examples per batch (None/0 means one batch with all data)
        @param valid_s, valid_idx, rng: see __batches"""

        self.pX = X
        self.X = X.values

        self.sY = Y
        self.Y = (Y.values if Y is not None else None)

        self.dfT = T
        self.T = (T["label_code"].values if T is not None else None)

        self.__shX = None
        self.__shY = None
        self.__shT = None

        self.batch_size = (batch_size or self.X.shape[0])

        nb = self.X.shape[0] // self.batch_size
        self.train_batches, self.valid_batches = self.__batches(self.X.shape[0],
                                                                self.batch_size,
                                                                (valid_s or 0.25),
                                                                (valid_idx or nb - nb // 4),
                                                                rng)
        self.label_names = None
        if self.dfT is not None:
            self.label_names = np.unique(self.dfT["label_name"].values).tolist()

        self.track_names = self.pX.major_axis.values.tolist()

    def __sh_anon(self, what, shape=None, borrow=True):
        # lazily build and cache a theano shared variable for member `what`
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        if getattr(self, "_AnchorDataset__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_AnchorDataset__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_AnchorDataset__sh%s" % what)

    # [reconstructed] the def lines of these properties were lost; names
    # follow the __sh<member> convention used in __sh_anon
    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the requested data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)

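    # Example (sketch): wrapping the design matrix for a theano graph; the
    # flattening shape is illustrative, any shape compatible with X.size works.
    #
    #   ds = AnchorDataset(pX, sY, dfT, 20)
    #   shared_X = ds.share("X", shape=(ds.X.shape[0], -1), borrow=True)
    #   # shared_X.get_value(borrow=True).shape == (nsamp, ntrack * width)
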
146 """infinite loop over train/valid batches
147
148 @param nepochs: loop this many times over train batches (0 will loop forever)
149 @return: iterator """
150
151 assert which in ("train_batches", "valid_batches")
152
153 batches = getattr(self, which)
154 epoch = 0
155 while True:
156 for i in batches:
157 yield i
158 epoch = epoch + 1
159 if epoch == nepochs:
160 break
161
164
167
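    # Example (sketch): driving a training loop with the iterator above
    # (train_fn is hypothetical):
    #
    #   for bidx in ds.batch_iterator("train_batches", nepochs=2):
    #       lo = bidx * ds.batch_size
    #       train_fn(ds.X[lo:lo + ds.batch_size])
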
    @staticmethod
    def __batches(tot_size, batch_s, valid_s, valid_idx, rng):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_size = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train batches
        and 1 validation batch: T T T V T

        @param tot_size: nr. of examples
        @param batch_s: batch size
        @param valid_s: fraction of the data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng: numpy.random.RandomState used to shuffle batches or None (no shuffle)
        @return: (train_batches, valid_batches)"""

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch size (%d) > min(valid size=%d, train size=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = list(range(tot_size // batch_s))
        # slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0
        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)
        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))
        return (train_batches, valid_batches)

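    # Worked example of the split above: tot_size=10, batch_s=2, valid_s=0.3,
    # valid_idx=3 gives all_batches = [0, 1, 2, 3, 4]; the validation slice is
    # all_batches[3:3 + int(5 * 0.3)] = [3], so train = [0, 1, 2, 4] and
    # valid = [3], i.e. the T T T V T layout from the docstring.
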
    # [reconstructed] the def lines of these properties were lost; names are
    # inferred from the (samples, tracks, width) layout of X
    @property
    def n_labels(self):
        return len(self.label_names)

    @property
    def n_tracks(self):
        return self.X.shape[1]

    @property
    def width(self):
        return self.X.shape[2]

    @staticmethod
    def normalize_features(x):  # [reconstructed] def line lost
        """transform each component of the flattened X examples to 0 mean and
        1 std, so that the values of track t at position i are 0 mean and 1
        std across anchors

        @param x: a pandas Panel of the form <anchors> x <tracks> x <genome position>
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component);
                 the latter 2 are arrays of shape (<tracks>, <genome position>)
        """

        normX, m, v = ops.standardize(x.values, axis=0)
        pX = pd.Panel(normX, items=x.items, major_axis=x.major_axis,
                      minor_axis=x.minor_axis)
        return pX, m.reshape(x.shape[1], -1), v.reshape(x.shape[1], -1)

    @staticmethod
    def fit_features(x):  # [reconstructed] def line lost
        """transform each **component** of X so that it fits in the interval
        [-1, 1], i.e. the values of track t at position i are all in [-1, 1]

        @param x: a pandas Panel of the form <anchors> x <tracks> x <genome position>
        @return: the scaled input
        """

        fitX = ops.fit(x.values.reshape(x.shape[0], -1), axis=0)

        return pd.Panel(fitX.reshape(x.shape),
                        items=x.items, major_axis=x.major_axis,
                        minor_axis=x.minor_axis)

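# Example (sketch) of the two transforms above on a small random panel
# (fit_features is the reconstructed name used in this file; ops.standardize
# and ops.fit are this package's own helpers):
#
#   raw = pd.Panel(np.random.randn(100, 3, 5) * 7.0 + 2.0)
#   normed, mean, sd = AnchorDataset.normalize_features(raw)
#   # normed.values[:, t, i] has ~0 mean and ~1 std for every (t, i)
#   scaled = AnchorDataset.fit_features(raw)   # values fall in [-1, 1]

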
class Dataset(object):
    """a plain dataset: X holds the examples; Y (targets) and T (labels) are
    optional"""

    def __init__(self, X, Y, T):
        self.X, self.Y, self.T = X, Y, T

        if (Y is not None) and X.shape[0] != Y.shape[0]:
            raise ValueError("|X| (%d) != |Y| (%d)" % (X.shape[0], Y.shape[0]))
        if (T is not None) and X.shape[0] != T.shape[0]:
            raise ValueError("|X| (%d) != |T| (%d)" % (X.shape[0], T.shape[0]))
        log.info("allocated dataset. X of shape %s, Y %s, T %s",
                 str(self.X.shape),
                 ("missing" if self.Y is None else str(self.Y.shape)),
                 ("missing" if self.T is None else str(self.T.shape)))

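    # Example (sketch) of the shape checks at construction time:
    #
    #   X, T = np.zeros((100, 15)), np.repeat([0, 1], 50)
    #   ds = Dataset(X, None, T)          # fine: |X| == |T|
    #   Dataset(X, np.zeros(99), None)    # ValueError: |X| (100) != |Y| (99)
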
    @property
    def is_labeled(self):
        return self.T is not None

    @property
    def n_labels(self):  # [reconstructed] def line lost
        if self.is_labeled:
            return np.unique(self.T).shape[0]
        else:
            raise AttributeError("unlabeled dataset")

    @staticmethod
    def normalize_features(x):  # def line lost; name confirmed by callers
        """transform each component of the flattened X examples to 0 mean and
        1 std, so that the values of feature f (across all examples) are 0
        mean and 1 std

        @param x: an ndarray of shape (nr. examples, nr. features)
        @return: (the standardized input,
                  the mean of each input component, the sd of each input component);
                 the latter 2 are arrays of shape (nr. features,)
        """

        return ops.standardize(x, axis=0)

    @staticmethod
    def fit_features(x):  # [reconstructed] def line lost
        """transform each **component** of X so that it fits in the interval
        [-1, 1], i.e. the values of feature f are all in [-1, 1]

        @param x: an ndarray of shape (nr. examples, nr. features)
        @return: the fitted input
        """

        return ops.fit(x, axis=0)

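# The semantics expected from the ops helpers used above (a sketch; the
# actual signatures live in ops.py):
#
#   x = np.random.randn(50, 4) * 3.0 + 1.0
#   normX, m, sd = ops.standardize(x, axis=0)   # per-column zero mean, unit std
#   fitX = ops.fit(x, axis=0)                   # per-column rescaling into [-1, 1]

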
305 """a dataset that can return its data as theano shared variables"""
306
308 self.__shX, self.__shY, self.__shT = None, None, None
309
    def __sh_anon(self, what, shape=None, borrow=True):
        # lazily build and cache a theano shared variable for member `what`
        if getattr(self, what) is None:
            raise ValueError("cannot share non-existent member %s" % what)

        if getattr(self, "_TheanoShare__sh%s" % what) is None:
            init_val = getattr(self, what)
            if shape:
                init_val = init_val.reshape(shape)

            setattr(self, "_TheanoShare__sh%s" % what,
                    theano.shared(init_val, borrow=borrow))
        return getattr(self, "_TheanoShare__sh%s" % what)

    # [reconstructed] the def lines of these properties were lost; names
    # follow the __sh<member> convention used in __sh_anon
    @property
    def shX(self):
        return self.__sh_anon("X")

    @property
    def shY(self):
        return self.__sh_anon("Y")

    @property
    def shT(self):
        return self.__sh_anon("T")

    def share(self, which, shape=None, borrow=True):
        """wrap the data in a theano.shared variable

        @param which: which component to wrap (str, typically 'X', 'T', 'Y')
        @param shape: reshape the array to this shape
        @param borrow: passed to theano.shared
        @return: theano.shared instance initialized to the requested data"""

        val = getattr(self, which)
        if shape is not None:
            val = val.reshape(shape)
        return theano.shared(val, borrow=borrow)

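# Example (sketch): shX/shY/shT cache one shared variable per member, while
# share() builds a fresh (optionally reshaped) one on every call. For any
# TheanoShare dataset ds:
#
#   ds.shX is ds.shX                       # True: cached on first access
#   ds.share("X") is ds.share("X")         # False: two distinct variables

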
349 """a mixin for batch functionality, valid and train sub-dataset"""
350
    def __init__(self, batch_s, tot_s=None, valid_s=None, valid_idx=None, rng=None):
        """create train and validation batches from the given params.

        the idea is to split the data into batches and allocate a 'valid_s'
        portion of them for validation. the position of the (contiguous)
        validation block is given in batch units. E.g., for tot_s = 10,
        batch_s = 2, valid_idx = 3, valid_s = 0.3 you get 4 train batches
        and 1 validation batch: T T T V T

        @param batch_s: batch size
        @param tot_s: nr. of examples (defaults to, and is capped at, |X|)
        @param valid_s: fraction of the data to allocate for validation
        @param valid_idx: batch index at which the validation block starts
        @param rng: numpy.random.RandomState used to shuffle batches or None (no shuffle)

        sets the train_batches and valid_batches attributes"""

        if tot_s is None:
            tot_s = self.X.shape[0]
        if self.X.shape[0] < tot_s:
            log.warning("total size (%d) > dataset size (%d). adjusting ...",
                        tot_s, self.X.shape[0])
        self.total_size = min(tot_s, self.X.shape[0])

        self.batch_size = batch_s

        if valid_s is None:
            valid_s = 0.25
        self.valid_size = valid_s

        self.n_batches = self.X.shape[0] // self.batch_size
        if valid_idx is None:
            valid_idx = self.n_batches - int(self.n_batches * self.valid_size)
        self.valid_idx = valid_idx

        self.rng = rng

        self.train_batches, self.valid_batches = self.__batches()

    def __batches(self):
        tot_size = self.total_size
        batch_s = self.batch_size
        valid_s = self.valid_size
        valid_idx = self.valid_idx
        rng = self.rng

        if valid_s <= 0 or valid_s >= 1:
            raise ValueError("valid_s (%f) should be in (0, 1)" % valid_s)

        if batch_s > tot_size * min(valid_s, 1 - valid_s):
            raise ValueError("batch_s (%d) > min(valid size=%d, train size=%d)" %
                             (batch_s, tot_size * valid_s, tot_size * (1 - valid_s)))

        all_batches = list(range(tot_size // batch_s))
        # slicing never raises IndexError, so check the bounds explicitly
        if not (0 <= valid_idx < len(all_batches)):
            raise ValueError("valid_idx (%d) should be between 0 and %d" %
                             (valid_idx, len(all_batches) - 1))
        valid_batches = all_batches[valid_idx:valid_idx + int(len(all_batches) * valid_s)]
        train_batches = list(set(all_batches) - set(valid_batches))
        assert set(train_batches + valid_batches) == set(all_batches)
        assert len(set(train_batches) & set(valid_batches)) == 0

        if rng is not None:
            rng.shuffle(train_batches)
            rng.shuffle(valid_batches)

        log.info("train batches: %s", str(train_batches))
        log.info("valid batches: %s", str(valid_batches))

        return (train_batches, valid_batches)

424 """infinite loop over train/valid batches
425
426 @param nepochs: loop this many times over train batches (0 will loop forever)
427 @return: iterator """
428
429 assert which in ("train_batches", "valid_batches")
430
431 batches = getattr(self, which)
432 epoch = 0
433 while True:
434 for i in batches:
435 yield i
436 epoch = epoch + 1
437 if epoch == nepochs:
438 break
439
442
445
class PandasViewMixin(object):  # [reconstructed] original class name lost
    """builds pandas views (pX, sY, dfT) over the raw X, Y, T ndarrays"""

    def __init__(self, anchors, tracks, width, labels):
        self.pX = pd.Panel(self.X, items=anchors, major_axis=tracks,
                           minor_axis=width)
        if self.Y is not None:
            self.sY = pd.Series(self.Y, index=anchors)

        self.label_names = None
        if self.T is not None:
            coden = [labels[v] for v in self.T]
            self.dfT = pd.DataFrame({"label_code": self.T,
                                     "label_name": coden})
            self.label_names = labels

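# Example (sketch) of the label bookkeeping above: with labels = ["neg", "pos"]
# and T = [0, 1, 0], dfT comes out as
#
#      label_code label_name
#   0           0        neg
#   1           1        pos
#   2           0        neg

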
463 """X, Y, T are a Panel, Series, and DataFrame resp."""
464
465 valY = None
466 if not (Y is None):
467 valY = Y.values
468 valT = None
469 if not (T is None):
470 valT = T["label_code"].values
471
472 Dataset.__init__(self, X.values, valY, valT)
473 TheanoShare.__init__(self)
474 self.pX, self.sY, self.dfT = X, Y, T
475
476 self.label_names = None
477 if not (self.T is None):
478 self.label_names = np.unique( self.dfT["label_name"].values ).tolist()
479
    # [reconstructed] the def lines of these properties were lost; the names
    # follow the attributes used elsewhere in this module
    @property
    def track_names(self):
        return self.pX.major_axis.tolist()

    @property
    def n_tracks(self):
        return self.X.shape[1]

    @property
    def width(self):
        return self.X.shape[2]

    @property
    def n_labels(self):
        return len(self.label_names)

    def dump(self, path):
        arch, key = archive.split(path)

        X, Y, T = self.pX, self.sY, self.dfT

        if X is not None:
            (nsamp, ntrack, width) = X.values.shape
            archive.save_object(arch, "%s/rawX" % key, X)
            normX, meanX, sdX = self.normalize_features(X.values.reshape(nsamp, -1))

            archive.save_object(arch, "%s/X" % key,
                                pd.Panel(normX.reshape(nsamp, ntrack, width),
                                         items=X.items,
                                         major_axis=X.major_axis,
                                         minor_axis=X.minor_axis))
            archive.save_object(arch, "%s/meanX" % key,
                                pd.DataFrame(meanX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))
            archive.save_object(arch, "%s/sdX" % key,
                                pd.DataFrame(sdX.reshape(ntrack, width),
                                             index=X.major_axis,
                                             columns=X.minor_axis))

        if Y is not None:
            archive.save_object(arch, "%s/Y" % key, Y)

        if T is not None:
            archive.save_object(arch, "%s/T" % key, T)

    @classmethod
    def load(cls, path, raw=False, *args):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, *args)

class BatchPanelDataset(PanelDataset, BatchesMixin):  # [reconstructed] original class name lost
    def __init__(self, X, Y, T, batch_s,
                 tot_s=None, valid_s=None, valid_idx=None, rng=None):
        # [reconstructed] body lost; initialize the panel dataset first, then
        # the batch bookkeeping (which reads self.X)
        PanelDataset.__init__(self, X, Y, T)
        BatchesMixin.__init__(self, batch_s, tot_s=tot_s, valid_s=valid_s,
                              valid_idx=valid_idx, rng=rng)

    @classmethod
    def load(cls, path, batch_s, raw=False, **kwargs):  # [reconstructed] def line lost
        ap, did = archive.split(path)

        key = "%s/%s" % (did, ("rawX" if raw else "X"))
        X = archive.load_object(ap, key)

        def load_none(k, p=ap, did=did):
            try:
                return archive.load_object(p, "%s/%s" % (did, k))
            except Exception:
                log.info("%s not found for this dataset", k)
        Y, T = load_none("Y"), load_none("T")
        return cls(X, Y, T, batch_s, **kwargs)
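

if __name__ == "__main__":
    # smoke test on synthetic data: a sketch of intended use, not part of the
    # original module (requires only numpy and the package's ops module)
    logging.basicConfig(level=logging.INFO)

    nsamp, nfeat = 100, 20
    X = np.random.randn(nsamp, nfeat)
    T = np.array([0, 1] * (nsamp // 2))

    ds = Dataset(X, None, T)
    print("labeled: %s, n_labels: %d" % (ds.is_labeled, ds.n_labels))

    normX, m, sd = Dataset.normalize_features(X)
    print("per-feature mean ~ 0: %s" % np.allclose(normX.mean(axis=0), 0.0))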