# Source code for tethne.readers.dfr

"""
Methods for parsing JSTOR Data-for-Research datasets.

.. autosummary::

   from_dir
   ngrams
   read

"""

import tethne.data as dt
import os
import xml.etree.ElementTree as ET
import re
from tethne.utilities import dict_from_node, strip_non_ascii
from nltk.corpus import stopwords
import uuid

def read(datapath):
    """
    Reads :class:`.Paper` s from a JSTOR DfR package.

    Each :class:`.Paper` is tagged with an accession id for this
    read/conversion.

    Parameters
    ----------
    datapath : string
        Filepath to unzipped JSTOR DfR folder containing a citations.XML file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        If ``citations.XML`` cannot be opened inside `datapath`.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.dfr.read("/Path/to/DfR")
    """

    citations_path = os.path.join(datapath, "citations.XML")

    # Only the file access is guarded; XML parse errors propagate unchanged.
    try:
        with open(citations_path, 'rb') as f:
            data = f.read()
    except IOError:
        raise IOError(citations_path + " not found.")

    # Every '&' is escaped before parsing. The file is read as bytes, so the
    # replacement uses bytes literals (valid on Python 2.6+ and Python 3).
    # NOTE(review): presumably the DfR export contains bare ampersands; an
    # entity that is already escaped would be double-escaped here -- confirm.
    root = ET.fromstring(data.replace(b'&', b'&amp;'))

    # One accession id is shared by every paper produced by this call.
    accession = str(uuid.uuid4())

    papers = []
    for article in root:
        paper = _handle_paper(article)
        paper['accession'] = accession
        papers.append(paper)

    return papers

def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of JSTOR DfR datasets.

    Parameters
    ----------
    path : string
        Path to directory containing DfR dataset directories.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.dfr.from_dir("/Path/to/datadir")

    """

    try:
        entries = os.listdir(path)
    except (IOError, OSError):
        # os.listdir signals failure with OSError, which is a different class
        # from IOError on Python 2; both are translated to the documented
        # IOError.
        raise IOError("Invalid path.")

    papers = []
    for name in entries:
        if name.startswith('.'):    # Ignore hidden files.
            continue

        dataset = os.path.join(path, name)
        if not os.path.isdir(dataset):
            continue

        try:
            papers += read(dataset)
        except (IOError, UnboundLocalError):
            # Ignore directories that don't contain DfR data.
            # NOTE(review): UnboundLocalError is kept from the original;
            # presumably it guarded against an earlier read() -- confirm
            # whether it is still needed.
            continue

    return papers

def ngrams(datapath, N='bi', ignore_hash=True, apply_stoplist=False):
    """
    Reads N-grams from a JSTOR DfR dataset.

    Parameters
    ----------
    datapath : string
        Filepath to unzipped JSTOR DfR folder containing N-grams (e.g.
        'bigrams').
    N : string
        'bi', 'tri', or 'quad'
    ignore_hash : bool
        If True, will exclude all N-grams that contain the hash '#' character.
    apply_stoplist : bool
        If True, will exclude all N-grams that contain words in the NLTK
        stoplist.

    Returns
    -------
    ngrams : dict
        Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> trigrams = rd.dfr.ngrams("/Path/to/DfR", N='tri')
    """

    gram_path = os.path.join(datapath, N + "grams")

    # Load the stoplist once, as a set, instead of once per file.
    # NOTE(review): words() is called without a language, as in the original;
    # NLTK then appears to return the entries of every stoplist file --
    # confirm whether a single language (e.g. 'english') was intended.
    stoplist = set(stopwords.words()) if apply_stoplist else None

    result = {}
    for fname in os.listdir(gram_path):
        # Only files with an upper-case 'XML' extension are read.
        if fname.split('.')[-1] != 'XML':
            continue

        root = ET.parse(os.path.join(gram_path, fname)).getroot()
        doi = root.attrib['id']     # The root element's id is used as the key.

        grams = [(gram.text.strip(), int(gram.attrib['weight']))
                 for gram in root.findall(N + 'gram')
                 if not ignore_hash or '#' not in gram.text]

        if apply_stoplist:
            # Keep an N-gram only when none of its words is a stop word, and
            # keep it exactly once.
            grams = [(g, c) for g, c in grams
                     if not any(w in stoplist for w in g.split())]

        result[doi] = grams

    return result

def _handle_paper(article):
    """
    Yields a :class:`.Paper` from an article ET node.

    Parameters
    ----------
    article : Element
        ElementTree Element 'article'.

    Returns
    -------
    paper : :class:`.Paper`
    """
    paper = dt.Paper()
    pdata = dict_from_node(article)

    # Direct mappings. dict.items() exists on both Python 2 and Python 3,
    # whereas iteritems() is Python 2 only.
    for key, value in _dfr2paper_map().items():
        try:
            paper[value] = str(pdata[key]).upper()
        except KeyError:    # Article may not have all keys of interest.
            pass

    # NOTE(review): 'author', 'pubdate' and 'pagerange' are indexed without a
    # KeyError guard; if dict_from_node omits absent elements this raises --
    # confirm against real DfR data.

    # Handle author names.
    paper['aulast'], paper['auinit'] = _handle_authors(pdata['author'])

    # Handle pubdate.
    paper['date'] = _handle_pubdate(pdata['pubdate'])

    # Handle pagerange.
    paper['spage'], paper['epage'] = _handle_pagerange(pdata['pagerange'])

    # Generate ayjid from the first author, the date and the journal title.
    try:
        paper['ayjid'] = _create_ayjid(paper['aulast'][0], paper['auinit'][0],
                                       paper['date'], paper['jtitle'])
    except IndexError:  # Article may not have authors.
        pass

    return paper

def _handle_pagerange(pagerange):
    """
    Yields start and end pages from DfR pagerange field.

    Parameters
    ----------
    pagerange : str or unicode
        DfR-style pagerange, e.g. "pp. 435-444".

    Returns
    -------
    start : str
        Start page, or "0" when no "pp. N-M" range is found.
    end : str
        End page, or "0" when no "pp. N-M" range is found.
    """

    # Raw string: the pattern's backslashes are regex escapes, not string
    # escapes. Only the first range in the field is used.
    match = re.search(r"pp\.\s([0-9]+)-([0-9]+)", pagerange)
    if match is None:
        return '0', '0'

    return str(match.group(1)), str(match.group(2))

def _handle_pubdate(pubdate):
    """
    Yields a date integer from DfR pubdate field.

    Parameters
    ----------
    pubdate : str or unicode
        DfR pubdate value; only its first four characters are used.

    Returns
    -------
    year : int
        Integer value of the first four characters of `pubdate`.
    """

    year = pubdate[:4]
    return int(year)

def _handle_authors(authors):
    """
    Yields aulast and auinit lists from value of authors node.

    Parameters
    ----------
    authors : list, str, or unicode
        Value or values of 'author' element in DfR XML.

    Returns
    -------
    aulast : list
        A list of author surnames (string).
    auinit : list
        A list of author first-initials (string).

    Raises
    ------
    ValueError
        If `authors` is neither a list nor a string.
    """

    # Python 2 has a separate ``unicode`` type; on Python 3 the name does not
    # exist and ``str`` already covers text.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    # Normalise the input to a list of names so both cases share one loop.
    if isinstance(authors, list):
        names = authors
    elif isinstance(authors, text_types):
        names = [authors]
    else:
        raise ValueError("authors must be a list or a string")

    aulast = []
    auinit = []
    for name in names:
        cleaned = str(strip_non_ascii(name))
        try:
            last, init = _handle_author(cleaned)
        except ValueError:  # Malformed names are skipped.
            continue
        aulast.append(last)
        auinit.append(init)

    return aulast, auinit

def _handle_author(author):
    """
    Yields aulast and auinit from an author's full name.

    Parameters
    ----------
    author : str or unicode
        Author fullname, e.g. "Richard L. Nixon".

    Returns
    -------
    aulast : str
        Author surname, upper-cased. A trailing generational suffix (Jr, Sr,
        II, III, IV, with or without a period) is kept after the surname.
    auinit : str
        Author first-initial (first character of the first name token).

    Raises
    ------
    ValueError
        If the name has no tokens, or consists of a suffix only.
    """

    # Split on any run of whitespace so that leading, trailing or repeated
    # spaces do not produce empty tokens.
    parts = author.split()

    try:
        auinit = parts[0][0]
        final = parts[-1].upper()
        suffix = final.rstrip('.')
        if suffix in ('JR', 'SR', 'II', 'III', 'IV'):
            # The surname is the token before the suffix.
            aulast = parts[-2].upper() + " " + suffix
        else:
            aulast = final
    except IndexError:
        raise ValueError("malformed author name")

    return str(aulast), str(auinit)

def _dfr2paper_map():
    """
    Builds the lookup from DfR article element names to the
    :class:`.Paper` fields they are copied into directly.

    Returns
    -------
    translator : dict
        Keys are DfR element names, values are :class:`.Paper` field names.
    """

    return dict(
        doi='doi',
        title='atitle',
        journaltitle='jtitle',
        volume='volume',
        issue='issue',
    )

def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
    """
    Builds the fuzzy identifier ayjid from author, year and journal title.

    Parameters
    ----------
    aulast : string or list
        Author surname; if a list is given its first element is used.
    auinit : string or list
        Author initial(s); if a list is given its first element is used.
    date : string or int
        Four-digit year; converted with ``str``.
    jtitle : string
        Title of the journal.
    kwargs : dict
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    ayj : string
        Upper-cased "aulast auinit date jtitle", or 'UNKNOWN PAPER' if all
        id components are missing (None).

    """

    def _first_or_blank(value):
        # None becomes an empty string; a list contributes its first element.
        if value is None:
            return ''
        if isinstance(value, list):
            return value[0]
        return value

    surname = _first_or_blank(aulast)
    initial = _first_or_blank(auinit)
    year = '' if date is None else date
    journal = '' if jtitle is None else jtitle

    ayj = ' '.join((surname, initial, str(year), journal))

    # Four empty components joined by single spaces give exactly three spaces.
    if ayj == '   ':
        ayj = 'Unknown paper'

    return ayj.upper()
# Navigation
#
# Source code for tethne.readers.dfr
#
# Quick search
#
# Navigation