# Source code for hepcrawl.spiders.pos_spider

# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for POS."""

import re

from scrapy import Request, Selector
from scrapy.spiders import Spider
from urlparse import urljoin
from ..utils import get_license, get_first
from ..dateutils import create_valid_date
from ..items import HEPRecord
from ..loaders import HEPLoader


class POSSpider(Spider):
    """POS/Sissa crawler.

    Extracts from metadata:
    title, article-id, conf-acronym, authors, affiliations,
    publication-date, publisher, license, language, link

    Example usage:

    .. code-block:: console

        scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml
    """
    name = 'PoS'
    # Base URL of a single contribution page; the record identifier is
    # appended directly (the trailing "?id=" is part of the base).
    pos_base_url = "https://pos.sissa.it/contribution?id="

    def __init__(self, source_file=None, **kwargs):
        """Construct POS spider.

        :param source_file: URL of the XML file with the records to harvest.
        """
        super(POSSpider, self).__init__(**kwargs)
        self.source_file = source_file

    def start_requests(self):
        """Request the source file given at spider construction."""
        yield Request(self.source_file)

    def parse(self, response):
        """Get PDF information.

        For every ``<record>`` in the source file, schedule a request to
        the contribution page so the PDF link can be scraped later.
        """
        node = response.selector
        node.remove_namespaces()
        for record in node.xpath('.//record'):
            identifier = record.xpath(
                './/metadata/pex-dc/identifier/text()'
            ).extract_first()
            if not identifier:
                continue
            # Probably all links lead to same place, so take first
            pos_url = "{0}{1}".format(self.pos_base_url, identifier)
            request = Request(pos_url, callback=self.scrape_pos_page)
            request.meta["url"] = response.url
            request.meta["record"] = record.extract()
            yield request

    def scrape_pos_page(self, response):
        """Parse a page for PDF link."""
        pdf_url = response.selector.xpath(
            "//a[contains(text(),'pdf')]/@href"
        ).extract_first()
        response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, pdf_url)
        response.meta["pos_url"] = response.url
        return self.build_item(response)

    def build_item(self, response):
        """Parse a PoS XML exported file into a HEP record.

        :param response: response whose ``meta`` carries the raw ``record``
            XML string, the contribution page URL (``pos_url``) and the
            fulltext link (``pos_pdf_url``).
        :return: a loaded ``HEPRecord`` item.
        """
        text = response.meta["record"]
        node = Selector(text=text, type="xml")
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node)

        record.add_xpath('title', '//metadata/pex-dc/title/text()')
        record.add_xpath('source', '//metadata/pex-dc/publisher/text()')
        record.add_value(
            'external_system_numbers', self._get_ext_systems_number(node))

        # NOTE: local renamed from ``license`` to avoid shadowing the builtin.
        license_info = get_license(
            license_text=node.xpath(
                ".//metadata/pex-dc/rights/text()"
            ).extract_first(),
        )
        record.add_value('license', license_info)

        date, year = self._get_date(node)
        if date:
            record.add_value('date_published', date)
        if year:
            record.add_value('journal_year', int(year))

        identifier = node.xpath(
            ".//metadata/pex-dc/identifier/text()").extract_first()
        record.add_value('urls', response.meta['pos_url'])
        if response.meta['pos_pdf_url']:
            record.add_value('additional_files',
                             {'type': "Fulltext",
                              "url": response.meta['pos_pdf_url']})
        if identifier:
            # Identifiers look like ``PoS(ACRONYM)ARTID``; splitting on the
            # parentheses recovers journal title, volume and article id.
            pbn = re.split('[()]', identifier)
            if len(pbn) == 3:
                conf_acronym = pbn[1]
                article_id = pbn[2]
                record.add_value('journal_title', pbn[0])
                record.add_value('journal_volume', conf_acronym)
                record.add_value('journal_artid', article_id)
            else:
                record.add_value('pubinfo_freetext', identifier)

        language = node.xpath(
            ".//metadata/pex-dc/language/text()").extract_first()
        if language:
            record.add_value('language', language)

        authors = self._get_authors(node)
        if authors:
            record.add_value('authors', authors)
        extra_data = self._get_extra_data(node)
        if extra_data:
            record.add_value('extra_data', extra_data)

        record.add_value('collections', ['HEP', 'ConferencePaper'])
        return record.load_item()

    def _get_ext_systems_number(self, node):
        """Build the external system number entries for the record."""
        return [
            {
                'institute': 'PoS',
                'value': node.xpath(
                    './/metadata/pex-dc/identifier/text()').extract_first()
            },
            {
                'institute': 'PoS',
                'value': node.xpath('.//identifier/text()').extract_first()
            },
        ]

    def _get_date(self, node):
        """Get article date.

        :return: ``(date, year)`` where ``year`` is the first four
            characters of the validated date, or ``''`` when no valid
            date could be built.
        """
        year = ''
        full_date = node.xpath(
            ".//metadata/pex-dc/date/text()").extract_first()
        date = create_valid_date(full_date)
        if date:
            year = date[0:4]
        return date, year

    def _get_authors(self, node):
        """Get article authors with their affiliations."""
        authors = []
        for selector in node.xpath('.//metadata/pex-dc/creator'):
            auth_dict = {}
            author = Selector(text=selector.extract())
            auth_dict['raw_name'] = get_first(
                author.xpath('.//name//text()').extract(), default='')
            for affiliation in author.xpath(
                    './/affiliation//text()').extract():
                auth_dict.setdefault(
                    'affiliations', []).append({'value': affiliation})
            if auth_dict:
                authors.append(auth_dict)
        return authors

    def _get_extra_data(self, node):
        """Get info to help selection - not for INSPIRE record."""
        section = node.xpath(
            ".//metadata/pex-dc/description/text()").extract_first()
        if section is None:
            # A missing description used to raise AttributeError on
            # ``None.split``; return no extra data instead of crashing.
            return {}
        return {'section': section.split(';', 1)[-1].strip()}