# Source code for tethne.readers.pubmed

"""
Methods for working with PubMed data are still under development. Please use
with care.

.. autosummary::

   read

"""

import tethne.data as ds
import xml.etree.ElementTree as ET
import uuid


# PubMed functions
def _pubmed_file_id(filename):
    """Placeholder for a future ID-file loader (not implemented).

    The intended behaviour is to take a file of PubMed-compatible IDs and
    produce one XML string per article referenced by those IDs. None of that
    exists yet: the argument is ignored and nothing is looked up.

    Parameters
    ----------
    filename : string
        Path to a file containing PubMed-compatible IDs. Currently unused.

    Returns
    -------
    None
        Always, until the function is implemented. The planned return value
        is a list of XML strings.

    """

    # Stub: no file is opened and no lookup is performed.
    return None

def _first_initial(element):
    """Return the first non-whitespace character of an element's text.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element or None
        Element whose text holds a given name (e.g. ``given-names``).

    Returns
    -------
    initial : string or None
        ``None`` when ``element`` is ``None`` or carries no usable text, so
        that an empty or missing name never raises while indexing.
    """
    if element is None:
        return None
    text = (element.text or '').strip()
    return text[0] if text else None


def read(filepath):
    """
    Given a file with PubMed XML, return a list of :class:`.Paper` instances.

    See the following hyperlinks regarding possible structures of XML:
    * http://www.ncbi.nlm.nih.gov/pmc/pmcdoc/tagging-guidelines/citations/v2/citationtags.html#2Articlewithmorethan10authors%28listthefirst10andaddetal%29
    * http://dtd.nlm.nih.gov/publishing/

    Every ``article`` element found in the file yields one :class:`.Paper`.
    Its ``element-citation`` references are stored, as :class:`.Paper`
    instances of their own, in the ``citations`` field. Each top-level
    :class:`.Paper` is tagged with an accession id shared by all papers
    produced by this read/conversion.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.pubmed.read("/Path/to/PubMedData.xml")

    Parameters
    ----------
    filepath : string
        Path to PubMed XML file.

    Returns
    -------
    meta_list : list
        A list of :class:`.Paper` instances, one per article.

    Raises
    ------
    IOError
        If the file does not exist, or cannot be read.
    """

    try:
        tree = ET.parse(filepath)
    except IOError:  # File does not exist, or couldn't be read.
        raise IOError("File does not exist, or cannot be read.")
    root = tree.getroot()

    # One accession id is shared by every paper produced by this call.
    accession = str(uuid.uuid4())

    # define location of simple article meta data relative to xml tree rooted
    # at 'article'
    meta_loc = {'atitle': './front/article-meta/title-group/article-title',
                'jtitle': ('./front/journal-meta/journal-title-group/' +
                           'journal-title'),
                'volume': './front/article-meta/volume',
                'issue': './front/article-meta/issue',
                'spage': './front/article-meta/fpage',
                'epage': './front/article-meta/lpage'}

    # location relative to element-citation element; the last page is tagged
    # 'lpage', exactly as in the article front matter above
    cit_meta_loc = {'atitle': './article-title',
                    'jtitle': './source',
                    'date': './year',
                    'volume': './volume',
                    'spage': './fpage',
                    'epage': './lpage'}

    meta_list = []
    for article in root.iter('article'):
        paper = ds.Paper()

        # collect the simple data from the 'front' section of the article;
        # fields that are absent from the XML are recorded as None
        for key, path in meta_loc.items():
            key_data = article.find(path)
            paper[key] = key_data.text if key_data is not None else None

        # collect doi and pmid; other identifier types are ignored
        for identifier in article.findall(
                './front/article-meta/article-id'):
            id_type = identifier.get('pub-id-type')
            if id_type in ('doi', 'pmid'):
                paper[id_type] = identifier.text

        # collect aulast and auinit; the two lists stay aligned, with None
        # standing in for a missing surname or given name
        aulast = []
        auinit = []
        for contrib in article.findall(
                './front/article-meta/contrib-group/contrib'):
            if contrib.get('contrib-type') != 'author':
                continue
            surname = contrib.find('./name/surname')
            aulast.append(surname.text if surname is not None else None)
            # multiple given names? this takes the initial of the first one
            auinit.append(_first_initial(contrib.find('./name/given-names')))
        paper['aulast'] = aulast
        paper['auinit'] = auinit

        # collect date: only pub-date elements of type 'collection' are used
        for pub_date in article.findall('./front/article-meta/pub-date'):
            if pub_date.get('pub-type') == 'collection':
                year = pub_date.find('./year')
                paper['date'] = year.text if year is not None else None

        # construct ayjid
        # NOTE(review): create_ayjid is neither defined nor imported in this
        # module, so this call raises NameError as the file stands. The
        # original source already flagged it as broken; confirm where the
        # helper lives and import it.
        paper['ayjid'] = create_ayjid(**paper)

        # citations
        citations_list = []

        # element-citation handling different from mixed-citation handling;
        # only element-citation references are read here
        for cite in article.findall('./back/ref-list/ref/element-citation'):
            cite_dict = ds.Paper()

            # simple meta data belongs to the citation, not to the citing
            # paper
            for key, path in cit_meta_loc.items():
                key_data = cite.find(path)
                cite_dict[key] = (key_data.text if key_data is not None
                                  else None)

            # doi and pmid (only the first pub-id element is inspected)
            pub_id = cite.find('./pub-id')
            if pub_id is not None:
                pub_id_type = pub_id.get('pub-id-type')
                if pub_id_type in ('doi', 'pmid'):
                    cite_dict[pub_id_type] = pub_id.text

            # aulast and auinit, taken only from a person-group of authors
            cite_aulast = []
            cite_auinit = []
            person_group = cite.find('./person-group')
            if (person_group is not None and
                    person_group.get('person-group-type') == 'author'):
                for name in person_group.findall('./name'):
                    surname = name.find('./surname')
                    cite_aulast.append(
                        surname.text if surname is not None else None)
                    cite_auinit.append(
                        _first_initial(name.find('./given-names')))

            # citations without any author are stored with None rather than
            # with empty lists
            cite_dict['aulast'] = cite_aulast or None
            cite_dict['auinit'] = cite_auinit or None

            citations_list.append(cite_dict)
        # end cite loop

        paper['citations'] = citations_list
        paper['accession'] = accession

        # append exactly once per article, after the paper is complete
        meta_list.append(paper)
    # end article loop

    return meta_list

def _expand_pubmed(meta_list):
    """Placeholder for a future network-expansion step (not implemented).

    The plan: ``meta_list`` holds first-level papers, each carrying its
    second-level papers under ``'citations'``. For every second-level paper
    with enough information (a DOI, a PubMed ID, or enough metadata to query
    for a DOI, etc.), query PubMed for its fuller metadata, most notably its
    own citation data, parse the returned XML, and append the resulting
    :class:`.Paper` to ``meta_list``.

    Nothing of this is implemented: the argument is left untouched.

    Parameters
    ----------
    meta_list : list
        A list of :class:`.Paper` instances.

    Returns
    -------
    None
        Always, until the function is implemented. The planned return value
        is a list of :class:`.Paper` instances.

    Notes
    -----
    Open question: what should happen to the redundant information about the
    second-level papers that remains stored in the first level?

    """
    # Stub: no query is made and meta_list is not modified.
    return None
# Navigation
#
# Source code for tethne.readers.pubmed
#
# Quick search
#
# Navigation