"""
Methods for parsing JSTOR Data-for-Research datasets.
.. autosummary::
ngrams
read
"""
import tethne.data as dt
import os
import xml.etree.ElementTree as ET
import re
from tethne.utilities import dict_from_node, strip_non_ascii
from nltk.corpus import stopwords
import uuid
def read(datapath):
    """
    Parse a JSTOR DfR package into a list of :class:`.Paper` objects.

    Every :class:`.Paper` produced by one call is tagged with the same
    accession id (a random UUID4 string) identifying this read/conversion.

    Parameters
    ----------
    datapath : string
        Path to unzipped JSTOR DfR folder containing a citations.XML file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects, one per child element of the
        XML root.

    Raises
    ------
    IOError
        If citations.XML cannot be opened inside `datapath`.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.dfr.read("/Path/to/DfR")
    """
    citations_path = os.path.join(datapath, "citations.XML")
    try:
        with open(citations_path, 'rb') as f:
            raw = f.read()
    except IOError:
        raise IOError(citations_path + " not found.")

    # Bare ampersands (e.g. in journal titles) make the XML ill-formed.
    # Escape only those that do not already start a predefined entity or a
    # numeric character reference, so well-formed input is not double-escaped.
    bare_ampersand = re.compile(
        br'&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9A-Fa-f]+);)')
    root = ET.fromstring(bare_ampersand.sub(b'&amp;', raw))

    accession = str(uuid.uuid4())   # Shared by all papers from this call.

    papers = []
    for article in root:
        paper = _handle_paper(article)
        paper['accession'] = accession
        papers.append(paper)

    return papers
def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of JSTOR DfR datasets.

    Each non-hidden subdirectory of `path` is passed to :func:`read`;
    subdirectories that cannot be read as DfR datasets are skipped.

    Parameters
    ----------
    path : string
        Path to directory containing DfR dataset directories.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.dfr.from_dir("/Path/to/datadir")
    """
    try:
        entries = os.listdir(path)
    except (IOError, OSError):
        # os.listdir signals a bad path with OSError, which is not an IOError
        # subclass on Python 2; normalise to the documented IOError.
        raise IOError("Invalid path.")

    papers = []
    for entry in entries:
        if entry.startswith('.'):           # Ignore hidden files.
            continue
        subdir = os.path.join(path, entry)
        if not os.path.isdir(subdir):
            continue
        try:
            papers += read(subdir)
        except (IOError, UnboundLocalError):
            # Deliberate best-effort: ignore directories that don't contain
            # DfR data.
            continue

    return papers
def ngrams(datapath, N='bi', ignore_hash=True, apply_stoplist=False):
    """
    Read N-grams from a JSTOR DfR dataset.

    Every file with the extension ``XML`` in the ``<N>grams`` subdirectory of
    `datapath` is parsed; the ``id`` attribute of its root element is used as
    the key, and each ``<N>gram`` child contributes one (text, weight) tuple.

    Parameters
    ----------
    datapath : string
        Path to unzipped JSTOR DfR folder containing N-grams (e.g.
        'bigrams').
    N : string
        'bi', 'tri', or 'quad'
    ignore_hash : bool
        If True, will exclude all N-grams that contain the hash '#' character.
    apply_stoplist : bool
        If True, will exclude all N-grams that contain words in the NLTK
        stoplist.

    Returns
    -------
    ngrams : dict
        Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> trigrams = rd.dfr.ngrams("/Path/to/DfR", N='tri')
    """
    gram_path = os.path.join(datapath, N + "grams")
    gram_tag = N + 'gram'

    # Load the stoplist once (not per file) and only when requested; a set
    # gives constant-time membership tests.
    stoplist = set(stopwords.words()) if apply_stoplist else None

    result = {}
    for fname in os.listdir(gram_path):
        if fname.split('.')[-1] != 'XML':   # Only the N-gram XML files.
            continue

        root = ET.parse(os.path.join(gram_path, fname)).getroot()
        doi = root.attrib['id']

        grams = [(gram.text.strip(), int(gram.attrib['weight']))
                 for gram in root.findall(gram_tag)
                 if not ignore_hash or '#' not in gram.text]

        if apply_stoplist:
            # Keep an N-gram only if none of its words is a stopword; each
            # surviving N-gram appears exactly once.
            grams = [(g, c) for g, c in grams
                     if not any(w in stoplist for w in g.split())]

        result[doi] = grams

    return result
def _handle_paper(article):
    """
    Build a :class:`.Paper` from an article ET node.

    The node is flattened with ``dict_from_node``; fields listed in
    :func:`_dfr2paper_map` are copied across as upper-cased strings, and the
    author, pubdate and pagerange fields are parsed by their dedicated
    helpers. An ``ayjid`` is added when a first author is available.

    Parameters
    ----------
    article : Element
        ElementTree Element 'article'.

    Returns
    -------
    paper : :class:`.Paper`
    """
    paper = dt.Paper()
    fields = dict_from_node(article)

    # Direct mappings: DfR element name -> Paper field name.
    field_map = _dfr2paper_map()
    for dfr_key in field_map:
        try:
            paper[field_map[dfr_key]] = str(fields[dfr_key]).upper()
        except KeyError:    # Article may not have all keys of interest.
            pass

    # NOTE(review): the three lookups below are unguarded; presumably a
    # missing 'author', 'pubdate' or 'pagerange' raises KeyError here --
    # confirm against dict_from_node.

    # Author names.
    surnames, initials = _handle_authors(fields['author'])
    paper['aulast'] = surnames
    paper['auinit'] = initials

    # Publication date.
    paper['date'] = _handle_pubdate(fields['pubdate'])

    # Page range.
    first_page, last_page = _handle_pagerange(fields['pagerange'])
    paper['spage'] = first_page
    paper['epage'] = last_page

    # Fuzzy identifier, built from the first author only.
    try:
        first_surname = paper['aulast'][0]
        first_initial = paper['auinit'][0]
        paper['ayjid'] = _create_ayjid(first_surname, first_initial,
                                       paper['date'], paper['jtitle'])
    except IndexError:  # Article may not have authors.
        pass

    return paper
def _handle_pagerange(pagerange):
"""
Yields start and end pages from DfR pagerange field.
Parameters
----------
pagerange : str or unicode
DfR-style pagerange, e.g. "pp. 435-444".
Returns
-------
start : str
Start page.
end : str
End page.
"""
try:
pr = re.compile("pp\.\s([0-9]+)\-([0-9]+)")
start, end = re.findall(pr, pagerange)[0]
except IndexError:
start = end = 0
return str(start), str(end)
def _handle_pubdate(pubdate):
"""
Yields a date integer from DfR pubdate field.
"""
return int(pubdate[0:4])
def _handle_authors(authors):
"""
Yields aulast and auinit lists from value of authors node.
Parameters
----------
authors : list, str, or unicode
Value or values of 'author' element in DfR XML.
Returns
-------
aulast : list
A list of author surnames (string).
auinit : list
A list of author first-initials (string).
"""
aulast = []
auinit = []
if type(authors) is list:
for author in authors:
author = str(strip_non_ascii(author))
try:
l,i = _handle_author(author)
aulast.append(l)
auinit.append(i)
except ValueError:
pass
elif type(authors) is str or type(authors) is unicode:
author = str(strip_non_ascii(authors))
try:
l,i = _handle_author(author)
aulast.append(l)
auinit.append(i)
except ValueError:
pass
else:
raise ValueError("authors must be a list or a string")
return aulast, auinit
def _handle_author(author):
"""
Yields aulast and auinit from an author's full name.
Parameters
----------
author : str or unicode
Author fullname, e.g. "Richard L. Nixon".
Returns
-------
aulast : str
Author surname.
auinit : str
Author first-initial.
"""
lname = author.split(' ')
try:
auinit = lname[0][0]
final = lname[-1].upper()
if final in ['JR.', 'III']:
aulast = lname[-2].upper() + " " + final.strip(".")
else:
aulast = final
except IndexError:
raise ValueError("malformed author name")
return str(aulast), str(auinit)
def _dfr2paper_map():
"""
Defines the direct relationships between DfR article elements and
:class:`.Paper` fields.
Returns
-------
translator : dict
A 'translator' dictionary.
"""
translator = { 'doi': 'doi',
'title': 'atitle',
'journaltitle': 'jtitle',
'volume': 'volume',
'issue': 'issue' }
return translator
def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
"""
Convert aulast, auinit, and jtitle into the fuzzy identifier ayjid
Returns 'Unknown paper' if all id components are missing (None).
Parameters
----------
Kwargs : dict
A dictionary of keyword arguments.
aulast : string
Author surname.
auinit: string
Author initial(s).
date : string
Four-digit year.
jtitle : string
Title of the journal.
Returns
-------
ayj : string
Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
missing (None).
"""
if aulast is None:
aulast = ''
elif isinstance(aulast, list):
aulast = aulast[0]
if auinit is None:
auinit = ''
elif isinstance(auinit, list):
auinit = auinit[0]
if date is None:
date = ''
if jtitle is None:
jtitle = ''
ayj = aulast + ' ' + auinit + ' ' + str(date) + ' ' + jtitle
if ayj == ' ':
ayj = 'Unknown paper'
return ayj.upper()