# Source code for rosetta.parallel.pandas_easy

"""
Functions for helping make pandas parallel.
"""
from functools import partial

import numpy as np
import pandas as pd

from .parallel_easy import map_easy


###############################################################################
# Globals
###############################################################################

###############################################################################
# Functions
###############################################################################


def groupby_to_scalar_to_series(df_or_series, func, n_jobs, **groupby_kwargs):
    """
    Returns a parallelized, simplified, and restricted version of:

    df_or_series.groupby(**groupby_kwargs).apply(func)

    Works ONLY for the simple case that .apply(func) would yield a Series of
    length equal to the number of groups, in other words, func applied to
    each group is a scalar.

    Parameters
    ----------
    df_or_series : DataFrame or Series
        This is what is grouped
    func : Function
        Applied to each group using func(group)
        Should return one single value (e.g. string or number)
        Must be picklable:  A lambda function will not work!
    n_jobs : Integer
        Passed unchanged to map_easy, which runs func over the groups.
    groupby_kwargs : Keyword args
        Passed directly to DataFrame.groupby to determine groups.
        The most common one is "by", e.g.
        by='a'
        by=my_grouper_function
        by=my_grouping_list_of_labels

    Returns
    -------
    result : Series
        Index is the group names
        Values are func(group) iterated over every group
        An empty Series is returned when there are no groups.

    Examples
    --------
    >>> from rosetta.parallel.pandas_easy import groupby_to_scalar_to_series
    >>> df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
    >>> df
       a  b
    0  6  4
    1  2  5
    2  2  6
    >>> groupby_to_scalar_to_series(df, max, 1, by='a')
    2    b
    6    b
    >>> s = pd.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    >>> labels = ['a', 'a', 'b', 'b']
    >>> groupby_to_scalar_to_series(s, max, 1, by=labels)
    a    2
    b    4
    """
    grouped = df_or_series.groupby(**groupby_kwargs)
    # use_apply=False: func receives the whole group and returns a scalar
    apply_func = partial(_get_label_values, func, False)

    # Materialize so the emptiness check below works for any iterable result
    labels_values = list(map_easy(apply_func, grouped, n_jobs))
    if not labels_values:
        # zip(*[]) yields nothing, so the unpacking below would raise
        return pd.Series([], dtype=object)

    labels, values = zip(*labels_values)

    return pd.Series(values, index=labels)


def groupby_to_series_to_frame(
        frame, func, n_jobs, use_apply=True, **groupby_kwargs):
    """
    A parallel function somewhat similar to DataFrame.groupby.apply(func).

    For each group in frame.groupby(**groupby_kwargs), compute
    func(group) or group.apply(func)
    and, assuming each result is a series, flatten each series then paste
    them together.

    Parameters
    ----------
    frame : DataFrame
    func : Function
        Applied to each group, see use_apply.
        Must be picklable:  A lambda function will not work!
    n_jobs : Integer
        Passed unchanged to map_easy, which runs func over the groups.
    use_apply : Boolean
        If True, use group.apply(func)
        If False, use func(group)
    groupby_kwargs : Keyword args
        Passed directly to DataFrame.groupby to determine groups.
        The most common one is "by", e.g.
        by='a'
        by=my_grouper_function
        by=my_grouping_list_of_labels

    Returns
    -------
    result : DataFrame
        Index is the group names
        Values are func(group) iterated over every group, then pasted
        together
        The index is named after "by" when "by" is a string or has a
        .name attribute, otherwise it is unnamed.
        An empty DataFrame is returned when there are no groups.

    Examples
    --------
    >>> from rosetta.parallel.pandas_easy import groupby_to_series_to_frame
    >>> df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
    >>> labels = ['g1', 'g1', 'g2']

    # Result and benchmark will be equal...despite the fact that you can't
    # do df.groupby(labels).apply(np.mean)

    >>> benchmark = df.groupby(labels).mean()
    >>> result = groupby_to_series_to_frame(
    ...     df, np.mean, 1, use_apply=True, by=labels)
    >>> print(result)
        a    b
    g1  4  4.5
    g2  2  6.0
    """
    # basestring only exists on Python 2; fall back to str on Python 3
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    grouped = frame.groupby(**groupby_kwargs)
    apply_func = partial(_get_label_values, func, use_apply)

    # Work out the index name.  "by" is optional (groups may be given via
    # e.g. "level"), so do not assume the key is present.
    by = groupby_kwargs.get('by')
    if hasattr(by, 'name'):
        indexname = by.name
    elif isinstance(by, string_types):
        indexname = by
    else:
        indexname = None

    # For every group, get the label (group name) and the values
    # (output of apply_func)
    labels_values = list(map_easy(apply_func, grouped, n_jobs))
    if not labels_values:
        # Nothing to unpack or concatenate
        return pd.DataFrame(index=pd.Index([], name=indexname))

    labels, values = zip(*labels_values)

    # Since each value is a series, concat along axis 1 to make a short
    # and fat frame, then take transpose
    concatted = pd.concat(values, axis=1).T

    # Set the index
    concatted.index = pd.Index(labels, name=indexname)

    return concatted


def _get_label_values(func, use_apply, name_and_group): """ Returns a tuple of a name, func(group) for this name_and_group. Used since .groupby() returns an iterator over the pairs (name, group). Parameters ---------- func : Function Must be picklable: A lambda function will not work! name_and_group : Tuple name, group use_apply : Boolean If True, use group.apply(func) If False, use func(group) Returns ------- name : the group name/label Same as the 'name' passed in value : Either group.apply(func) or func(group) """ name, group = name_and_group value = group.apply(func) if use_apply else func(group) return name, value if __name__ == '__main__': # Can't get doctest to work with multiprocessing... pass