# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for arXiv."""

import re

from scrapy import Request, Selector
from scrapy.spiders import XMLFeedSpider

from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
from ..utils import coll_cleanforthe, get_license, split_fullname
from ..items import HEPRecord
from ..loaders import HEPLoader

RE_CONFERENCE = re.compile(r'\b(%s)\b' % '|'.join(
    [re.escape(word) for word in CONFERENCE_WORDS]), re.I | re.U)
RE_THESIS = re.compile(r'\b(%s)\b' % '|'.join(
    [re.escape(word) for word in THESIS_WORDS]), re.I | re.U)
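# A minimal doctest-style sketch of how these patterns behave. The sample
# strings are illustrative assumptions, not fixtures from this repo; they
# assume CONFERENCE_WORDS contains a word like 'proceedings' and
# THESIS_WORDS a word like 'thesis':
#
#     >>> bool(RE_CONFERENCE.search('Proceedings of the XXIV Workshop'))
#     True
#     >>> bool(RE_THESIS.search('PhD thesis, University of Somewhere'))
#     True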


class ArxivSpider(XMLFeedSpider):
    """Spider for crawling arXiv.org OAI-PMH XML files.

    .. code-block:: console

        scrapy crawl arXiv -a source_file=file://`pwd`/tests/responses/arxiv/sample_arxiv_record.xml

    """

    name = 'arXiv'
    iterator = 'xml'
    itertag = 'OAI-PMH:record'
    namespaces = [
        ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/"),
    ]

    def __init__(self, source_file=None, **kwargs):
        """Construct Arxiv spider."""
        super(ArxivSpider, self).__init__(**kwargs)
        self.source_file = source_file

    def start_requests(self):
        yield Request(self.source_file)
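    # A trimmed, illustrative sketch of one 'OAI-PMH:record' node that
    # parse_node() below consumes; the element values are invented
    # placeholders, not a fixture from this repo:
    #
    #   <record>
    #     <metadata>
    #       <arXiv>
    #         <id>1612.12345</id>
    #         <created>2016-12-31</created>
    #         <title>Some title</title>
    #         <categories>hep-th hep-ph</categories>
    #         <authors>
    #           <author><keyname>Doe</keyname><forenames>Jane</forenames></author>
    #         </authors>
    #       </arXiv>
    #     </metadata>
    #   </record>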
    def parse_node(self, response, node):
        """Parse an arXiv XML exported file into a HEP record."""
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node)
        record.add_xpath('title', './/title/text()')
        record.add_xpath('abstract', './/abstract/text()')
        record.add_xpath('preprint_date', './/created/text()')
        record.add_xpath('dois', './/doi//text()')
        record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
        record.add_value('source', 'arXiv')

        authors, collabs = self._get_authors_or_collaboration(node)
        record.add_value('authors', authors)
        record.add_value('collaborations', collabs)

        collections = ['HEP', 'Citeable', 'arXiv']
        comments = '; '.join(node.xpath('.//comments//text()').extract())
        if comments:
            pages, notes, doctype = self._parse_comments_info(comments)
            record.add_value('public_notes', notes)
            if pages:
                record.add_value('page_nr', pages)
            if doctype:
                collections.append(doctype)
        record.add_value('collections', collections)

        record.add_value(
            'report_numbers',
            self._get_arxiv_report_numbers(node)
        )

        categories = ' '.join(
            node.xpath('.//categories//text()').extract()
        ).split()
        record.add_value(
            'arxiv_eprints',
            self._get_arxiv_eprint(node, categories)
        )
        record.add_value(
            'external_system_numbers',
            self._get_ext_systems_number(node)
        )

        license = get_license(
            license_url=node.xpath('.//license//text()').extract_first()
        )
        record.add_value('license', license)

        parsed_record = dict(record.load_item())
        return parsed_record
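    # For a record like the sketch above, the loader output is a plain dict;
    # illustratively (exact keys and value shapes depend on HEPRecord and
    # HEPLoader, so this is an assumption, not guaranteed output):
    #
    #   {'title': 'Some title',
    #    'preprint_date': '2016-12-31',
    #    'source': 'arXiv',
    #    'arxiv_eprints': {'value': '1612.12345',
    #                      'categories': ['hep-th', 'hep-ph']},
    #    'collections': ['HEP', 'Citeable', 'arXiv'],
    #    ...}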
    def _get_authors_or_collaboration(self, node):
        """Parse authors and affiliations; extract collaborations."""
        author_selectors = node.xpath('.//authors//author')

        # Take 'for the' out of the general phrases and don't use it in
        # affiliations.
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaboration = []
        for selector in author_selectors:
            author = Selector(text=selector.extract())
            forenames = ' '.join(
                author.xpath('.//forenames//text()').extract()
            )
            keyname = ' '.join(author.xpath('.//keyname//text()').extract())
            name_string = " %s %s " % (forenames, keyname)
            affiliations = author.xpath('.//affiliation//text()').extract()

            # Collaborations in the affiliation field? Be cautious with
            # 'for the' in institute names.
            collab_in_aff = []
            for index, aff in enumerate(affiliations):
                if any(
                    phrase for phrase in collab_phrases
                    if phrase in aff.lower()
                ) and not any(
                    phrase for phrase in inst_phrases
                    if phrase in aff.lower()
                ):
                    collab_in_aff.append(index)
            collab_in_aff.reverse()
            for index in collab_in_aff:
                coll, author_name = coll_cleanforthe(affiliations.pop(index))
                if coll and coll not in collaboration:
                    collaboration.append(coll)

            # Check if the name is a collaboration, else append to authors.
            collab_in_name = ' for the ' in name_string.lower() or any(
                phrase for phrase in collab_phrases
                if phrase in name_string.lower()
            )
            if collab_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaboration:
                    collaboration.append(coll)
            elif name_string.strip() == ':':
                # Everything up to now seems to be collaboration info.
                for author_info in authors:
                    name_string = " %s %s " % \
                        (author_info['given_names'], author_info['surname'])
                    coll, author_name = coll_cleanforthe(name_string)
                    if coll and coll not in collaboration:
                        collaboration.append(coll)
                authors = []
            else:
                authors.append({
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': [{"value": aff} for aff in affiliations],
                })

        return authors, collaboration

    def _parse_comments_info(self, comments):
        """Parse comments; extract doctype for ConferencePaper and Thesis."""
        pages = ''
        doctype = ''
        notes = {'source': 'arXiv', 'value': comments}

        found_pages = re.search(r'(?i)(\d+)\s*pages?\b', comments)
        if found_pages:
            pages = found_pages.group(1)

        if RE_THESIS.search(comments):
            doctype = 'Thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'ConferencePaper'

        return pages, notes, doctype

    def _get_arxiv_report_numbers(self, node):
        """Return report numbers from the comma-separated report-no field."""
        report_numbers = ','.join(node.xpath('.//report-no//text()').extract())
        if report_numbers:
            return [
                {
                    'source': '',
                    'value': rn.strip(),
                }
                for rn in report_numbers.split(',')
            ]
        return []

    def _get_arxiv_eprint(self, node, categories):
        """Return the arXiv identifier with its subject categories."""
        return {
            'value': node.xpath('.//id//text()').extract_first(),
            'categories': categories,
        }

    def _get_ext_systems_number(self, node):
        """Return the OAI identifier as an external system number."""
        return {
            'institute': 'arXiv',
            'value': node.xpath('.//identifier//text()').extract_first(),
        }
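# A minimal doctest-style sketch of the comment parsing above. It assumes
# 'workshop' appears in CONFERENCE_WORDS, which may not hold for this repo's
# mappings:
#
#     >>> spider = ArxivSpider()
#     >>> spider._parse_comments_info('4 pages, talk given at a workshop')
#     ('4', {'source': 'arXiv', 'value': '4 pages, talk given at a workshop'}, 'ConferencePaper')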