# Source code for tethne.readers.pubmed
"""
Methods for working with PubMed data are still under development. Please use
with care.

.. autosummary::

   read

"""
import tethne.data as ds
import xml.etree.ElementTree as ET
import uuid
# PubMed functions
def _pubmed_file_id(filename):
    """
    Placeholder for a future feature (not implemented).

    The planned behaviour is to take a file of PubMed-compatible IDs and
    hand back one XML string for each article associated with those IDs.
    For now the argument is ignored and nothing is produced.

    Parameters
    ----------
    filename : string
        Path to a file containing PubMed-compatible IDs. Currently unused.

    Returns
    -------
    None
        Always, until the feature is implemented. The planned return value
        is a list of XML strings.
    """
    # Nothing to do yet; fall out with the implicit ``None`` result.
    return
def _find_text(element, path):
    """Return the text of the first node matching ``path``, or None."""
    node = element.find(path)
    if node is None:
        return None
    return node.text


def _first_initial(node):
    """Return the first non-blank character of ``node``'s text, or None."""
    # An empty element (``<given-names/>``) has ``text`` of None, and an
    # all-whitespace one has nothing to index; both yield None.
    if node is None or not node.text:
        return None
    stripped = node.text.strip()
    if not stripped:
        return None
    return stripped[0]


def _surname_and_initial(element, surname_path, given_path):
    """Return ``(surname, initial)`` for one author-bearing element."""
    # When several given names are present only the first character of the
    # whole given-names string is kept.
    return (_find_text(element, surname_path),
            _first_initial(element.find(given_path)))


def _build_ayjid(aulast, auinit, date, jtitle):
    """
    Assemble an "author year journal" identifier string.

    Joins the first author's surname, the first author's initial, the year
    and the journal title with single spaces and upper-cases the result.
    Missing components are treated as empty strings; if every component is
    missing the text ``'Unknown paper'`` is used instead (also upper-cased).

    NOTE(review): the previous code called ``create_ayjid``, which is neither
    defined nor imported in this module. The layout used here is an
    assumption about the intended format -- confirm that it matches the
    ayjid values produced elsewhere in the package.
    """
    first_last = aulast[0] if aulast else None
    first_init = auinit[0] if auinit else None
    parts = [first_last or '',
             first_init or '',
             '%s' % (date,) if date is not None else '',
             jtitle or '']
    ayjid = ' '.join(parts)
    if not ayjid.strip():
        ayjid = 'Unknown paper'
    return ayjid.upper()


def read(filepath):
    """
    Given a file with PubMed XML, return a list of :class:`.Paper` instances.

    See the following hyperlinks regarding possible structures of XML:

    * http://www.ncbi.nlm.nih.gov/pmc/pmcdoc/tagging-guidelines/citations/v2/citationtags.html#2Articlewithmorethan10authors%28listthefirst10andaddetal%29
    * http://dtd.nlm.nih.gov/publishing/

    One :class:`.Paper` is produced per ``article`` element. Each carries the
    simple front-matter fields (``atitle``, ``jtitle``, ``volume``, ``issue``,
    ``spage``, ``epage``), ``doi``/``pmid`` when present, ``aulast`` and
    ``auinit`` lists, ``date`` (taken from the ``pub-date`` whose
    ``pub-type`` is ``collection``), an ``ayjid``, a ``citations`` list with
    one :class:`.Paper` per ``element-citation``, and an ``accession`` id
    shared by every paper from this read/conversion.

    **Usage**

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.pubmed.read("/Path/to/PubMedData.xml")

    Parameters
    ----------
    filepath : string
        Path to PubMed XML file.

    Returns
    -------
    meta_list : list
        A list of :class:`.Paper` instances, one per article.

    Raises
    ------
    IOError
        If the file does not exist or cannot be read.
    """
    try:
        root = ET.parse(filepath).getroot()
    except IOError:     # File does not exist, or couldn't be read.
        raise IOError("File does not exist, or cannot be read.")

    # One accession id tags everything produced by this call.
    accession = str(uuid.uuid4())

    # Location of simple article metadata relative to an 'article' element.
    meta_loc = {'atitle': './front/article-meta/title-group/article-title',
                'jtitle': ('./front/journal-meta/journal-title-group/' +
                           'journal-title'),
                'volume': './front/article-meta/volume',
                'issue': './front/article-meta/issue',
                'spage': './front/article-meta/fpage',
                'epage': './front/article-meta/lpage'}

    # Location relative to an 'element-citation' element. The last page of
    # a citation lives in <lpage>, the same element used in meta_loc.
    cit_meta_loc = {'atitle': './article-title',
                    'jtitle': './source',
                    'date': './year',
                    'volume': './volume',
                    'spage': './fpage',
                    'epage': './lpage'}

    meta_list = []
    for article in root.iter('article'):
        paper = ds.Paper()

        # Simple front-matter fields; missing nodes are recorded as None.
        for key, path in meta_loc.items():
            paper[key] = _find_text(article, path)

        # DOI and PubMed id; other id types are ignored.
        for identifier in article.findall(
                './front/article-meta/article-id'):
            id_type = identifier.get('pub-id-type')
            if id_type in ('doi', 'pmid'):
                paper[id_type] = identifier.text

        # Author surnames and initials, kept as parallel lists.
        aulast = []
        auinit = []
        for contrib in article.findall(
                './front/article-meta/contrib-group/contrib'):
            if contrib.get('contrib-type') != 'author':
                continue
            surname, initial = _surname_and_initial(
                contrib, './name/surname', './name/given-names')
            aulast.append(surname)
            auinit.append(initial)
        paper['aulast'] = aulast
        paper['auinit'] = auinit

        # Publication year: only 'collection' pub-dates are considered, and
        # if several are present the last one wins.
        pub_year = None
        for pub_date in article.findall('./front/article-meta/pub-date'):
            if pub_date.get('pub-type') == 'collection':
                pub_year = _find_text(pub_date, './year')
                paper['date'] = pub_year

        paper['ayjid'] = _build_ayjid(
            aulast, auinit, pub_year,
            _find_text(article, meta_loc['jtitle']))

        # Citations. Only element-citation is handled; mixed-citation needs
        # different handling and is not parsed here.
        citations_list = []
        for cite in article.findall('./back/ref-list/ref/element-citation'):
            cite_dict = ds.Paper()

            # Simple metadata goes on the citation itself, never on the
            # citing paper.
            for key, path in cit_meta_loc.items():
                cite_dict[key] = _find_text(cite, path)

            # Only the first pub-id of the citation is inspected.
            pub_id = cite.find('./pub-id')
            if pub_id is not None:
                pub_id_type = pub_id.get('pub-id-type')
                if pub_id_type in ('doi', 'pmid'):
                    cite_dict[pub_id_type] = pub_id.text

            # Names are collected only when the first person-group is
            # explicitly marked as the author group.
            cite_aulast = []
            cite_auinit = []
            person_group = cite.find('./person-group')
            if (person_group is not None and
                    person_group.get('person-group-type') == 'author'):
                for name in person_group.findall('./name'):
                    surname, initial = _surname_and_initial(
                        name, './surname', './given-names')
                    cite_aulast.append(surname)
                    cite_auinit.append(initial)

            # A citation without authors stores None rather than [].
            cite_dict['aulast'] = cite_aulast or None
            cite_dict['auinit'] = cite_auinit or None

            citations_list.append(cite_dict)

        paper['citations'] = citations_list
        paper['accession'] = accession

        # Append exactly once per article, after the paper is complete.
        meta_list.append(paper)

    return meta_list
def _expand_pubmed(meta_list):
    """
    Placeholder for a future feature (not implemented).

    The planned behaviour: take a list of first-level papers together with
    the second-level papers held in each one's ``citations`` entry, and grow
    the network by promoting those second-level papers to the first level.
    For every cited paper with enough information (a DOI, a PubMed ID, or
    enough metadata to query for a DOI, etc.), PubMed would be queried for
    its fuller metadata -- most notably its own citation data -- the returned
    XML parsed, and the resulting :class:`.Paper` appended to ``meta_list``.

    For now the argument is ignored and nothing is produced.

    Parameters
    ----------
    meta_list : list
        A list of :class:`.Paper` instances. Currently unused.

    Returns
    -------
    None
        Always, until the feature is implemented. The planned return value
        is a list of :class:`.Paper` instances.

    Notes
    -----
    Open question: what should happen to the redundant copy of each cited
    paper's information that would still be stored under the first-level
    paper?
    """
    return None