# Source code for tethne.matrices.dfr
"""
Methods for generating NumPy data objects from JSTOR Data-for-Research datasets.

.. autosummary::

   array
   matrix

"""
import numpy as np
class Map(object):
    """
    Two-way mapping between integer indices and string values.

    Every assignment is recorded in both directions: after ``m[3] = "foo"``
    (or, equivalently, ``m["foo"] = 3``), ``m[3]`` returns ``"foo"`` and
    ``m["foo"]`` returns ``3``.

    Keys are dispatched on their type. Text keys (``str``, plus ``unicode``
    on Python 2) are looked up in ``by_str``; integer keys (``int`` and NumPy
    integer scalars) are looked up in ``by_int``. Any other key type raises
    ``TypeError`` rather than being silently ignored.
    """

    # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # this covers both text types without referencing a Python-2-only name.
    _STRING_TYPES = (str, type(u""))
    _INDEX_TYPES = (int, np.integer)

    def __init__(self):
        self.by_str = {}    # string value -> integer index
        self.by_int = {}    # integer index -> string value

    def __setitem__(self, key, value):
        """
        Record the pair ``key`` <-> ``value`` in both lookup tables.

        Raises
        ------
        TypeError
            If ``key`` is neither a string nor an integer.
        """
        if isinstance(key, self._STRING_TYPES):
            self.by_str[key] = value
            self.by_int[value] = key
        elif isinstance(key, self._INDEX_TYPES):
            self.by_int[key] = value
            self.by_str[value] = key
        else:
            raise TypeError(
                "Map keys must be str or int, not {0}".format(
                    type(key).__name__))

    def __getitem__(self, key):
        """
        Return the index for a string key, or the string for an integer key.

        Raises
        ------
        KeyError
            If ``key`` has a supported type but was never assigned.
        TypeError
            If ``key`` is neither a string nor an integer.
        """
        if isinstance(key, self._STRING_TYPES):
            return self.by_str[key]
        if isinstance(key, self._INDEX_TYPES):
            return self.by_int[key]
        # Returning None here would let callers use it as an array index,
        # which NumPy interprets as "insert a new axis" instead of failing.
        raise TypeError(
            "Map keys must be str or int, not {0}".format(type(key).__name__))

    def __len__(self):
        """Return the number of string values held in the mapping."""
        return len(self.by_str)
def array(data, normalize=False, verbose=False):
    """
    Build a NumPy array from ``data``, with document and feature index maps.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> import tethne.matrices as mt
       >>> data = rd.dfr.ngrams("/Path/to/DfR/data")
       >>> A, doc_index, feat_index = mt.dfr.array(data, normalize=True)

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    normalize : bool
        If True, values are divided by the largest frequency found anywhere
        in ``data`` and the array has a float dtype. If False, frequencies are
        stored as given in an integer array. When ``data`` holds no
        frequencies, or the largest frequency is zero, no scaling is applied.
    verbose : bool
        If True, progress messages are printed.

    Returns
    -------
    A : Numpy array
        Shape is (number of documents, number of features): rows are
        documents, columns are features.
    document_index : :class:`.Map`
        Maps row indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps column indices to features.
    """
    document_index, feature_index = _index_data(data, verbose)
    N_docs = len(document_index)
    N_feat = len(feature_index)

    if verbose:
        print("array(): {0} documents, {1} features".format(N_docs, N_feat))

    # The global maximum is only needed for normalization, so it is not
    # computed otherwise. An empty dataset has no maximum; leave scale unset.
    scale = None
    if normalize:
        counts = [float(v) for features in data.values()
                  for _, v in features]
        if counts:
            scale = max(counts)

    # Builtin float/int are used because the np.float/np.int aliases no
    # longer exist in current NumPy releases.
    A = np.zeros((N_docs, N_feat), dtype=float if normalize else int)

    for d, (doc, features) in enumerate(data.items(), 1):
        if verbose and d % 200 == 0:
            print("array(): Processed {0} of {1} documents"
                  .format(d, N_docs))

        i = document_index[doc]
        for f, v in features:
            j = feature_index[f]
            # A zero (or missing) scale would divide by zero; store v as-is.
            A[i, j] = v / scale if scale else v

    if verbose:
        print("array(): Processed all documents.")

    return A, document_index, feature_index
def matrix(data, normalize=False, verbose=False):
    """
    Build a NumPy matrix from ``data``, with document and feature index maps.

    This is a thin wrapper: the array produced by :func:`array` is passed
    through ``np.asmatrix`` and returned together with the same index maps.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> import tethne.matrices as mt
       >>> data = rd.dfr.ngrams("/Path/to/DfR/data")
       >>> M, doc_index, feat_index = mt.dfr.matrix(data, normalize=True)

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    normalize : bool
        Passed through to :func:`array`; if True, matrix values are relative
        to the maximum value in the matrix.
    verbose : bool
        If True, progress messages are printed (here and by :func:`array`).

    Returns
    -------
    M : Numpy matrix
        Same layout as the array built by :func:`array`, which allocates one
        row per document and one column per feature.
    document_index : :class:`.Map`
        Maps row indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps column indices to features.
    """

    def _say(message):
        # Emit a progress message only when the caller asked for them.
        if verbose:
            print(message)

    _say("matrix(): converting array to matrix.")
    dense, doc_map, feat_map = array(data, normalize, verbose)
    as_matrix = np.asmatrix(dense)
    _say("matrix(): done.")

    return as_matrix, doc_map, feat_map
def _index_data(data, verbose=False):
    """
    Build document and feature indices from a data dict.

    Documents are numbered in the iteration order of ``data``. Features are
    collected into a set across all documents and numbered in that set's
    iteration order.

    Parameters
    ----------
    data : dict
        Keys are document identifiers (e.g. DOIs), values are lists of
        (feature, frequency) tuples.
    verbose : bool
        If True, progress messages are printed.

    Returns
    -------
    document_index : :class:`.Map`
        Maps integer indices to document identifiers (keys of provided data).
    feature_index : :class:`.Map`
        Maps integer indices to features.
    """
    document_index = Map()
    feature_index = Map()

    N_docs = len(data)
    if verbose:
        print("_index_data(): Indexing {0} documents".format(N_docs))

    features = set()
    # One pass over the items: indexing data.keys()/data.values() by position
    # rebuilds both lists on every iteration under Python 2 and is not
    # supported by Python 3 dict views.
    for i, (doc, doc_features) in enumerate(data.items()):   # Index documents.
        if verbose and i % 200 == 0:
            print("_index_data(): Indexed {0} of {1} documents"
                  .format(i, N_docs))

        document_index[i] = doc
        for f, _ in doc_features:
            features.add(f)     # Build a set of features.

    N_feat = len(features)
    if verbose:
        print("_index_data(): Done indexing documents.")
        print("_index_data(): Indexing {0} features.".format(N_feat))

    for i, f in enumerate(features):     # Index features.
        if verbose and i % 10000 == 0:
            print("_index_data(): Indexed {0} of {1} features."
                  .format(i, N_feat))
        feature_index[i] = f

    if verbose:
        print("_index_data(): Done indexing features.")

    return document_index, feature_index