# Source code for rosetta.parallel.pandas_easy

"""
Functions for helping make pandas parallel.
"""
from functools import partial

import numpy as np
import pandas as pd

from .parallel_easy import map_easy


###############################################################################
# Globals
###############################################################################

###############################################################################
# Functions
###############################################################################


def groupby_to_scalar_to_series(df_or_series, func, n_jobs, **groupby_kwargs):
    """
    Returns a parallelized, simplified, and restricted version of:
    df_or_series.groupby(**groupby_kwargs).apply(func)

    Works ONLY for the simple case that .apply(func) would yield a Series
    of length equal to the number of groups, in other words, func applied
    to each group is a scalar.

    Parameters
    ----------
    df_or_series : DataFrame or Series
        This is what is grouped
    func : Function
        Applied to each group using func(group)
        Should return one single value (e.g. string or number)
        Must be picklable:  A lambda function will not work!
    n_jobs : Integer
        Passed straight through to map_easy as its job-count argument.
    groupby_kwargs : Keyword args
        Passed directly to DataFrame.groupby to determine groups.
        The most common one is "by", e.g.
            by='a'
            by=my_grouper_function
            by=my_grouping_list_of_labels

    Returns
    -------
    result : Series
        Index is the group names
        Values are func(group) iterated over every group
        If the groupby yields no groups, an empty Series is returned.

    Examples
    --------
    >>> from rosetta.parallel.pandas_easy import groupby_to_scalar_to_series
    >>> df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
    >>> df
       a  b
    0  6  4
    1  2  5
    2  2  6
    >>> groupby_to_scalar_to_series(df, max, 1, by='a')
    2    b
    6    b

    >>> s = pd.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    >>> labels = ['a', 'a', 'b', 'b']
    >>> groupby_to_scalar_to_series(s, max, 1, by=labels)
    a    2
    b    4
    """
    grouped = df_or_series.groupby(**groupby_kwargs)
    # use_apply=False: func receives the whole group and returns a scalar.
    apply_func = partial(_get_label_values, func, False)

    # Materialize so emptiness can be tested whatever iterable map_easy returns.
    labels_values = list(map_easy(apply_func, grouped, n_jobs))
    if not labels_values:
        # zip(*[]) yields nothing, so the two-name unpack below would raise
        # an unhelpful ValueError; an empty result is the natural answer.
        return pd.Series([], dtype=object)

    labels, values = zip(*labels_values)

    return pd.Series(values, index=labels)


def groupby_to_series_to_frame(
    frame, func, n_jobs, use_apply=True, **groupby_kwargs):
    """
    A parallel function somewhat similar to DataFrame.groupby.apply(func).

    For each group in frame.groupby(**groupby_kwargs), compute
    func(group) or group.apply(func) and, assuming each result is a series,
    flatten each series then paste them together.

    Parameters
    ----------
    frame : DataFrame
        This is what is grouped
    func : Function
        Applied to each group (see use_apply)
        Must be picklable:  A lambda function will not work!
    n_jobs : Integer
        Passed straight through to map_easy as its job-count argument.
    use_apply : Boolean
        If True, use group.apply(func)
        If False, use func(group)
    groupby_kwargs : Keyword args
        Passed directly to DataFrame.groupby to determine groups.
        The most common one is "by", e.g.
            by='a'
            by=my_grouper_function
            by=my_grouping_list_of_labels

    Returns
    -------
    result : DataFrame
        Index is the group names
        Values are func(group) iterated over every group, then pasted together
        The index is named after "by" when "by" is a string or has a
        .name attribute; otherwise the index is unnamed.
        If the groupby yields no groups, an empty DataFrame is returned.

    Examples
    --------
    >>> from rosetta.parallel.pandas_easy import groupby_to_series_to_frame
    >>> df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
    >>> labels = ['g1', 'g1', 'g2']
    # Result and benchmark will be equal...despite the fact that you can't
    # do df.groupby(labels).apply(np.mean)
    >>> benchmark = df.groupby(labels).mean()
    >>> result = groupby_to_series_to_frame(
    ...    df, np.mean, 1, use_apply=True, by=labels)
    >>> print(result)
        a    b
    g1  4  4.5
    g2  2  6.0
    """
    # basestring only exists on Python 2; fall back to str on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    grouped = frame.groupby(**groupby_kwargs)
    apply_func = partial(_get_label_values, func, use_apply)

    # Work out the index name.  "by" may be absent (e.g. grouping with
    # level=...), so use .get() rather than indexing.
    by = groupby_kwargs.get('by')
    if hasattr(by, 'name'):
        indexname = by.name
    elif isinstance(by, string_types):
        indexname = by
    else:
        indexname = None

    # For every group, get the label (group name) and the values
    # (output of apply_func).  Materialize so emptiness can be tested
    # whatever iterable map_easy returns.
    labels_values = list(map_easy(apply_func, grouped, n_jobs))
    if not labels_values:
        # pd.concat refuses an empty sequence; return an empty frame instead.
        return pd.DataFrame(index=pd.Index([], name=indexname))

    labels, values = zip(*labels_values)

    # Since each value is a series, concat along axis 1 to make a short
    # and fat frame, then take transpose
    concatted = pd.concat(values, axis=1).T

    # Set the index
    concatted.index = pd.Index(labels, name=indexname)

    return concatted


def _get_label_values(func, use_apply, name_and_group):
    """
    Returns a tuple of a name, func(group) for this name_and_group.
    Used since .groupby() returns an iterator over the pairs (name, group).

    Parameters
    ----------
    func : Function
        Must be picklable:  A lambda function will not work!
    name_and_group : Tuple
        name, group
    use_apply : Boolean
        If True, use group.apply(func)
        If False, use func(group)

    Returns
    -------
    name : the group name/label
        Same as the 'name' passed in
    value : Either group.apply(func) or func(group)
    """
    name, group = name_and_group

    value = group.apply(func) if use_apply else func(group)

    return name, value


if __name__ == '__main__':
    # Nothing is executed when this module is run as a script.  The docstring
    # examples are not run through doctest here; the original author's note:
    # Can't get doctest to work with multiprocessing...
    pass