"""
Common functions/classes for dataprep.
"""
import numpy as np
import cPickle
import itertools
import os
import sys
import functools
from collections import defaultdict
###############################################################################
# Decorators
###############################################################################
[docs]def lazyprop(fn):
"""
Use as a decorator to get lazily evaluated properties.
"""
attr_name = '_lazy_' + fn.__name__
@property
def _lazyprop(self):
if not hasattr(self, attr_name):
setattr(self, attr_name, fn(self))
return getattr(self, attr_name)
return _lazyprop
###############################################################################
# Wrappers for opening/closing files
###############################################################################
[docs]class smart_open(object):
"""Context manager that opens a filename and closes it on exit, but does
nothing for file-like objects.
"""
def __init__(self, filename, *args):
"""
The exact same call structure as the built-in function 'open'
Parmeters
---------
filename : filepath, buffer, or StringIO
args : Optional args
First arg will be 'mode', e.g. 'r', 'rb', 'w'
Second arg will be 'buffering', read the docs for open
"""
if isinstance(filename, basestring):
self.fh = open(filename, *args)
self.closing = True
else:
self.fh = filename
self.closing = False
def __enter__(self):
return self.fh
def __exit__(self, exc_type, exc_val, exc_tb):
if self.closing:
self.fh.close()
return False
###############################################################################
# Functions to read special file formats
###############################################################################
[docs]def get_list_from_filerows(infile):
"""
Returns a list generated from rows of a file.
Parameters
----------
infile : File buffer or path
Lines starting with # are comments
Blank lines and leading/trailing whitespace are ignored
Other lines will be converted to a string and appended to a
list.
"""
with smart_open(infile, 'rb') as f:
kpv_list = []
for line in f:
# Strip whitespace
line = line.strip()
# Skip empty lines
if len(line) > 0:
# If the line isn't a comment
# Append the content to the list
if line[0] != '#':
kpv_list.append(line.rstrip('\n'))
return kpv_list
[docs]def write_list_to_filerows(outfile, mylist):
"""
The inverse of get_list_from_filerows.
Parameters
----------
outfile : filepath or buffer
mylist : List
"""
with smart_open(outfile, 'wb') as f:
for item in mylist:
f.write(str(item) + '\n')
[docs]def pickleme(obj, pkl_file, protocol=2):
"""
Save obj to disk using cPickle.
Parameters
----------
obj : Serializable Python object
pkl_file : filepath or buffer
File to store obj to
protocol : 0, 1, or 2
2 is fastest
"""
with smart_open(pkl_file, 'w') as f:
cPickle.dump(obj, f, protocol=protocol)
[docs]def unpickleme(pkl_file):
"""
Returns unpickled version of object.
Parameters
----------
pkl_file : filepath or buffer
We will attempt to unpickle this file.
"""
with smart_open(pkl_file, 'r') as f:
return cPickle.load(f)
[docs]def get_structured_array(listoflists, schema, dropmissing=False):
"""
Uses schema to convert listoflists to a structured array.
Parameters
----------
listoflists : List of lists
schema : List of tuples
E.g. [(var1, type1),...,(varK, typeK)]
dropmissing : Boolean
If True, drop rows that contain missing values
"""
## First convert listoflists to a list of tuples...
# TODO : This CAN'T actually be necessary..find another way
if dropmissing:
tuple_list = [tuple(row) for row in listoflists if '' not in row]
else:
tuple_list = [tuple(row) for row in listoflists]
return np.array(tuple_list, schema)
###############################################################################
# Custom Exceptions
###############################################################################
[docs]class BadDataError(Exception):
"""
Dummy class that is exactly like the Exception class. Used to make sure
people are raising the intended exception, rather than some other wierd
one.
"""
pass
[docs]class TokenError(Exception):
"""
Raise when tokens are passed to a method/function and you don't know how
to deal with them.
"""
pass
[docs]class ConfigurationSyntaxError(Exception):
"""
Dummy class that is exactly like the Exception class.
Used to deal with syntax issues config files.
"""
pass
class DocIDError(Exception):
pass
###############################################################################
# Functions for printing objects
###############################################################################
def printdict(d, max_print_len=None):
s = ''
for key, value in d.iteritems():
s += str(key) + ': ' + str(value) + '\n'
if max_print_len:
print s[:max_print_len]
else:
print s
###############################################################################
# Custom data structures
###############################################################################
[docs]def nested_defaultdict(default_factory, levels=1):
"""
Creates nested defaultdicts with the lowest level having default factory.
Parameters
----------
default_factory : Callable
Called without arguments to produce a new value when a key is not
present.
levels : Positive Integer
The number of nesting levels to use. If levels == 1, this is just
an ordinary defaultdict.
Examples
--------
>>> mydict = nested_defaultdict(int, levels=2)
>>> mydict['columbia']['undergrads'] += 1
"""
if not isinstance(levels, int) or (levels < 1):
raise ValueError("levels =%s, should be a postitive integer" % levels)
def nestone():
"""Used in place of a lambda to allow pickling"""
return nested_defaultdict(default_factory, levels - 1)
if levels == 1:
return defaultdict(default_factory)
else:
return defaultdict(nestone)
[docs]def nested_keysearch(ndict, key_list):
"""
Returns True if ndict[key_list[0]][key_list[1]]...[key_list[-1]] exists.
Parameters
----------
ndict : Nested dictionary
E.g. {'a': {'b': 2}}
key_list : List of strings
"""
if isinstance(key_list, basestring):
key_list = [key_list]
first_key = key_list[0]
if len(key_list) == 1:
return first_key in ndict
else:
if first_key in ndict:
return nested_keysearch(ndict[first_key], key_list[1: ])
###############################################################################
# String type operations
###############################################################################
###############################################################################
# Functional
###############################################################################
[docs]def grouper(iterable, chunksize, fillvalue=None):
"""
Group iterable into chunks of length n, with fillvalue for the (possibly)
smaller last chunk.
grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
Parameters
----------
iterable : Iterable
chunksie : Integer
fillvalue : Anything
Fill missing values with this
"""
args = [iter(iterable)] * chunksize
return itertools.izip_longest(fillvalue=fillvalue, *args)
def compose(*functions):
def compose2(f, g):
def f_g(x):
return f(g(x))
return f_g
return functools.reduce(compose2, functions)