# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
"""Spider for arXiv."""
import re
from scrapy import Request, Selector
from scrapy.spiders import XMLFeedSpider
from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
from ..utils import coll_cleanforthe, get_license, split_fullname
from ..items import HEPRecord
from ..loaders import HEPLoader
# Whole-word, case-insensitive, Unicode-aware matchers built from the word
# lists in ``..mappings``.  Every word is regex-escaped, so the lists may
# contain characters that are special in regular expressions.
# ``%`` interpolation is kept on purpose: it also works when the word lists
# hold unicode strings.
RE_CONFERENCE = re.compile(
    r'\b(%s)\b' % '|'.join(map(re.escape, CONFERENCE_WORDS)),
    re.IGNORECASE | re.UNICODE
)
RE_THESIS = re.compile(
    r'\b(%s)\b' % '|'.join(map(re.escape, THESIS_WORDS)),
    re.IGNORECASE | re.UNICODE
)
class ArxivSpider(XMLFeedSpider):
    """Spider for crawling arXiv.org OAI-PMH XML files.

    Every ``OAI-PMH:record`` element of the source file is handed to
    :meth:`parse_node`, which returns one HEP record as a plain dictionary.

    .. code-block:: console

        scrapy crawl arXiv -a source_file=file://`pwd`/tests/responses/arxiv/sample_arxiv_record.xml

    """

    name = 'arXiv'
    iterator = 'xml'
    itertag = 'OAI-PMH:record'
    namespaces = [
        ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/")
    ]

    def __init__(self, source_file=None, **kwargs):
        """Construct Arxiv spider.

        :param source_file: URL of the XML file to crawl (passed on the
            command line with ``-a source_file=...``).
        :param kwargs: forwarded unchanged to the parent spider.
        """
        super(ArxivSpider, self).__init__(**kwargs)
        self.source_file = source_file

    def start_requests(self):
        """Yield the single request that fetches ``source_file``.

        NOTE(review): ``source_file`` is passed to ``Request`` as is, so a
        spider started without it fails inside ``Request``.
        """
        yield Request(self.source_file)

    def parse_node(self, response, node):
        """Parse an arXiv XML exported file into a HEP record.

        :param response: response the node was taken from (not used here).
        :param node: selector for one record element.
        :returns: the loaded ``HEPRecord`` item converted to a ``dict``.
        """
        # Lets the XPath expressions below use bare element names.
        node.remove_namespaces()

        record = HEPLoader(item=HEPRecord(), selector=node)
        record.add_xpath('title', './/title/text()')
        record.add_xpath('abstract', './/abstract/text()')
        record.add_xpath('preprint_date', './/created/text()')
        record.add_xpath('dois', './/doi//text()')
        record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
        record.add_value('source', 'arXiv')

        authors, collabs = self._get_authors_or_collaboration(node)
        record.add_value('authors', authors)
        record.add_value('collaborations', collabs)

        collections = ['HEP', 'Citeable', 'arXiv']
        comments = '; '.join(node.xpath('.//comments//text()').extract())
        if comments:
            pages, notes, doctype = self._parse_comments_info(comments)
            record.add_value('public_notes', notes)
            if pages:
                record.add_value('page_nr', pages)
            if doctype:
                # The comments hint at a thesis or a conference paper.
                collections.append(doctype)
        record.add_value('collections', collections)

        record.add_value(
            'report_numbers',
            self._get_arxiv_report_numbers(node)
        )

        # Categories are whitespace separated inside the element text.
        categories = ' '.join(
            node.xpath('.//categories//text()').extract()
        ).split()
        record.add_value(
            'arxiv_eprints',
            self._get_arxiv_eprint(node, categories)
        )
        record.add_value(
            'external_system_numbers',
            self._get_ext_systems_number(node)
        )

        # Named ``license_info`` so the ``license`` builtin is not shadowed.
        license_info = get_license(
            license_url=node.xpath('.//license//text()').extract_first()
        )
        record.add_value('license', license_info)

        return dict(record.load_item())

    def _get_authors_or_collaboration(self, node):
        """Parse authors and affiliations; extract collaborations.

        :param node: selector for one record element.
        :returns: tuple ``(authors, collaboration)`` where ``authors`` is a
            list of dicts with the keys ``surname``, ``given_names`` and
            ``affiliations``, and ``collaboration`` is a list of unique
            collaboration names in order of first appearance.
        """
        author_selectors = node.xpath('.//authors//author')

        # 'for the' is deliberately left out of the general phrases: it is
        # only checked against author names, never against affiliations.
        collab_phrases = [
            'consortium', ' collab ', 'collaboration', ' team', 'group',
            ' on behalf of ', ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaboration = []
        for selector in author_selectors:
            author = Selector(text=selector.extract())
            forenames = ' '.join(
                author.xpath('.//forenames//text()').extract()
            )
            keyname = ' '.join(author.xpath('.//keyname//text()').extract())
            # Padded with spaces so that phrases such as ' team' or
            # ' for the ' can match at either end of the name.
            name_string = " %s %s " % (forenames, keyname)
            name_lower = name_string.lower()
            affiliations = author.xpath('.//affiliation//text()').extract()

            # Collaborations in the affiliation field?  Be cautious: an
            # affiliation that also contains an institution phrase is kept
            # as an affiliation.
            collab_in_aff = []
            for index, aff in enumerate(affiliations):
                aff_lower = aff.lower()
                is_collab = any(
                    phrase in aff_lower for phrase in collab_phrases
                )
                is_inst = any(phrase in aff_lower for phrase in inst_phrases)
                if is_collab and not is_inst:
                    collab_in_aff.append(index)
            # Pop from the end so that the remaining indices stay valid.
            for index in reversed(collab_in_aff):
                coll, _ = coll_cleanforthe(affiliations.pop(index))
                if coll and coll not in collaboration:
                    collaboration.append(coll)

            # Check if the name is a collaboration, else append to authors.
            collab_in_name = ' for the ' in name_lower or any(
                phrase in name_lower for phrase in collab_phrases
            )
            if collab_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    # A real author name was left over next to the
                    # collaboration; it is recorded without affiliations.
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaboration:
                    collaboration.append(coll)
            elif name_string.strip() == ':':
                # Everything up to now seems to be collaboration info:
                # re-read the collected "authors" as collaborations and
                # start the author list afresh.
                for author_info in authors:
                    collected_name = " %s %s " % \
                        (author_info['given_names'], author_info['surname'])
                    coll, _ = coll_cleanforthe(collected_name)
                    if coll and coll not in collaboration:
                        collaboration.append(coll)
                authors = []
            else:
                authors.append({
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': [{"value": aff} for aff in affiliations]
                })
        return authors, collaboration

    def _parse_comments_info(self, comments):
        """Parse comments; extract doctype for ConferencePaper and Thesis.

        :param comments: non-empty comments string of the record.
        :returns: tuple ``(pages, notes, doctype)``: ``pages`` is the number
            found in front of "page(s)" as a string (``''`` if none),
            ``notes`` is the public note dict holding the raw comments, and
            ``doctype`` is ``'Thesis'``, ``'ConferencePaper'`` or ``''``.
        """
        pages = ''
        doctype = ''
        notes = {'source': 'arXiv', 'value': comments}

        found_pages = re.search(r'(?i)(\d+)\s*pages?\b', comments)
        if found_pages:
            pages = found_pages.group(1)

        # Thesis takes precedence when both kinds of words are present.
        if RE_THESIS.search(comments):
            doctype = 'Thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'ConferencePaper'

        return pages, notes, doctype

    def _get_arxiv_report_numbers(self, node):
        """Return the comma separated report numbers as a list of dicts.

        Blank entries (e.g. from a trailing or doubled comma) are skipped so
        that no report number with an empty value is produced.
        """
        report_numbers = ','.join(node.xpath('.//report-no//text()').extract())
        stripped = (rn.strip() for rn in report_numbers.split(','))
        return [
            {
                'source': '',
                'value': value,
            } for value in stripped if value
        ]

    def _get_arxiv_eprint(self, node, categories):
        """Return the eprint dict: first ``id`` text plus the categories."""
        return {
            'value': node.xpath('.//id//text()').extract_first(),
            'categories': categories
        }

    def _get_ext_systems_number(self, node):
        """Return the external system number built from ``identifier``."""
        return {
            'institute': 'arXiv',
            'value': node.xpath('.//identifier//text()').extract_first()
        }