# Source code for tethne.matrices.dfr

"""
Methods for generating Numpy data objects from JSTOR Data-for-Research datasets.

.. autosummary::

   array
   matrix

"""

import numpy as np

class Map(object):
    """
    Two-way lookup table between integer indices and string values.

    Assigning ``m[3] = "foo"`` or ``m["foo"] = 3`` records the pair in both
    directions, so that afterwards ``m[3] == "foo"`` and ``m["foo"] == 3``.

    String keys are resolved through ``by_str`` and integer keys through
    ``by_int``. A key of any other type raises ``TypeError``; a key of a
    supported type that has not been assigned raises ``KeyError``.

    Re-assigning an existing key does not remove the reverse entry of the
    value that the key previously pointed to.
    """

    # Python 2 has a separate ``unicode`` text type, which must count as a
    # string as well; on Python 3 the name does not exist.
    try:
        _STRING_TYPES = (str, unicode)  # noqa: F821 -- Python 2 only.
    except NameError:
        _STRING_TYPES = (str,)

    # NumPy integer scalars hash and compare like plain ints, so they can be
    # used interchangeably with them as dictionary keys.
    _INTEGER_TYPES = (int, np.integer)

    def __init__(self):
        self.by_str = {}    # string value -> integer index
        self.by_int = {}    # integer index -> string value

    def _tables(self, key):
        """
        Return the ``(forward, reverse)`` dicts that apply to ``key``.

        Raises
        ------
        TypeError
            If ``key`` is neither a string nor an integer.
        """
        if isinstance(key, self._STRING_TYPES):
            return self.by_str, self.by_int
        if isinstance(key, self._INTEGER_TYPES):
            return self.by_int, self.by_str
        raise TypeError(
            "Map keys must be int or str, not {0}".format(type(key).__name__))

    def __setitem__(self, key, value):
        forward, reverse = self._tables(key)
        forward[key] = value
        reverse[value] = key    # Keep the opposite direction in step.

    def __getitem__(self, key):
        forward, _ = self._tables(key)
        return forward[key]

    def __len__(self):
        return len(self.by_str)

def array(data, normalize=False, verbose=False):
    """
    Yields a Numpy array, along with document-index and feature-index mappings.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> import tethne.matrices as mt
       >>> data = rd.dfr.ngrams("/Path/to/DfR/data")
       >>> A, doc_index, feat_index = mt.dfr.array(data, normalize=True)

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    normalize : bool
        If True, matrix values are relative to the maximum value in the matrix
        and the array has a float dtype. If False, the array has an integer
        dtype, so non-integer frequencies are truncated on assignment.
    verbose : bool
        If True, progress messages are printed.

    Returns
    -------
    A : Numpy array
        Shape is (number of documents, number of features): rows are
        documents, columns are features.
    document_index : :class:`.Map`
        Maps row indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps column indices to features.

    """
    document_index, feature_index = _index_data(data, verbose)
    N_docs = len(document_index)
    N_feat = len(feature_index)

    if verbose:
        print("array(): {0} documents, {1} features".format(N_docs, N_feat))

    # The maximum is only needed for normalization. Computing it lazily, and
    # tolerating the absence of any value, lets empty datasets through
    # instead of failing on the maximum of an empty sequence.
    max_v = None
    if normalize:
        values = [float(v) for features in data.values() for _, v in features]
        if values:
            max_v = max(values)

    # Builtin ``float``/``int`` are what the removed ``np.float``/``np.int``
    # aliases referred to.
    dtype = float if normalize else int
    A = np.zeros((N_docs, N_feat), dtype=dtype)

    for d, (doc, features) in enumerate(data.items(), 1):
        if verbose and d % 200 == 0:
            print("array(): Processed {0} of {1} documents"
                  .format(d, N_docs))

        i = document_index[doc]
        for f, v in features:
            j = feature_index[f]
            # A maximum of zero means there is nothing to scale by; the
            # value is stored unscaled rather than dividing by zero.
            if max_v:
                v = v / max_v
            A[i, j] = v

    if verbose:
        print("array(): Processed all documents.")

    return A, document_index, feature_index

def matrix(data, normalize=False, verbose=False):
    """
    Yields a Numpy matrix, along with document-index and feature-index mappings.

    This is a thin wrapper around :func:`array` that converts the resulting
    array with ``numpy.asmatrix``.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> import tethne.matrices as mt
       >>> data = rd.dfr.ngrams("/Path/to/DfR/data")
       >>> M, doc_index, feat_index = mt.dfr.matrix(data, normalize=True)

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    normalize : bool
        If True, matrix values are relative to the maximum value in the matrix.
    verbose : bool
        If True, progress messages are printed.

    Returns
    -------
    M : Numpy matrix
        Rows are documents, columns are features.
    document_index : :class:`.Map`
        Maps row indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps column indices to features.

    """

    if verbose:
        print("matrix(): converting array to matrix.")

    A, document_index, feature_index = array(data, normalize, verbose)
    M = np.asmatrix(A)

    if verbose:
        print("matrix(): done.")

    return M, document_index, feature_index

def _index_data(data, verbose=False):
    """
    Yields document and feature indices from a data dict.

    Documents are numbered in the iteration order of ``data``. Features are
    numbered in the iteration order of the set of distinct features, which is
    arbitrary.

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    verbose : bool
        If True, progress messages are printed.

    Returns
    -------
    document_index : :class:`.Map`
        Maps integer indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps integer indices to features.
    """

    document_index = Map()
    feature_index = Map()
    N_docs = len(data)

    if verbose:
        print("_index_data(): Indexing {0} documents".format(N_docs))

    features = set()
    # A single pass over the items avoids rebuilding the key and value lists
    # for every document, and works where dict views cannot be subscripted.
    for i, (doc, doc_features) in enumerate(data.items()):  # Index documents.
        if verbose and i % 200 == 0:
            print("_index_data(): Indexed {0} of {1} documents"
                  .format(i, N_docs))

        document_index[i] = doc
        for f, _ in doc_features:
            features.add(f)     # Build a set of features.

    N_feat = len(features)
    if verbose:
        print("_index_data(): Done indexing documents.")
        print("_index_data(): Indexing {0} features.".format(N_feat))

    for i, feature in enumerate(features):  # Index features.
        if verbose and i % 10000 == 0:
            print("_index_data(): Indexed {0} of {1} features."
                  .format(i, N_feat))

        feature_index[i] = feature

    if verbose:
        print("_index_data(): Done indexing features.")

    return document_index, feature_index
# Navigation
#
# Source code for tethne.matrices.dfr
#
# Quick search
#
# Navigation