# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for World Scientific."""

from __future__ import absolute_import, print_function

import os
import urlparse

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..extractors.jats import Jats
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    ftp_list_files,
    ftp_connection_info,
    get_license,
    unzip_xml_files,
)


class WorldScientificSpider(Jats, XMLFeedSpider):
    """World Scientific Proceedings crawler.

    This spider connects to a given FTP host and downloads zip files with
    XML files for extraction into HEP records. This means that it generates
    the URLs for Scrapy to crawl in a special way:

    1. First it connects to the FTP host, lists all the new ZIP files found
       on the remote server and downloads them to a designated local folder,
       using `start_requests()`.

    2. Then each ZIP file is unpacked and all the XML files found inside are
       listed, via `handle_package_ftp()` (or `handle_package_file()` for a
       local package), the callback set in `start_requests()`.

    3. Finally, each XML file is parsed via `parse_node()`.

    To run a crawl, you need to pass FTP connection information via
    `ftp_host` and `ftp_netrc`:

    .. code-block:: console

        scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'

    Happy crawling!
    """

    name = 'WSP'
    custom_settings = {}
    start_urls = []
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'article'

    allowed_article_types = [
        'research-article',
        'corrected-article',
        'original-article',
        'introduction',
        'letter',
        'correction',
        'addendum',
        'review-article',
        'rapid-communications'
    ]

    def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None,
                 ftp_netrc=None, *args, **kwargs):
        """Construct WSP spider."""
        super(WorldScientificSpider, self).__init__(*args, **kwargs)
        self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
        self.ftp_netrc = ftp_netrc
        self.target_folder = "/tmp/WSP"
        self.package_path = package_path
        if not os.path.exists(self.target_folder):
            os.makedirs(self.target_folder)
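
    # Illustrative alternative (not in the upstream docstring): a ZIP package
    # already on disk can be crawled directly, bypassing FTP, by passing its
    # file:// URL as `package_path`; the path below is only a placeholder.
    #
    #   scrapy crawl WSP -a 'package_path=file:///path/to/local/package.zip'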

    def start_requests(self):
        """List selected folder on remote FTP and yield new zip files."""
        if self.package_path:
            yield Request(self.package_path, callback=self.handle_package_file)
        else:
            ftp_host, ftp_params = ftp_connection_info(self.ftp_host,
                                                       self.ftp_netrc)
            dummy, new_files = ftp_list_files(
                self.ftp_folder,
                self.target_folder,
                server=ftp_host,
                user=ftp_params['ftp_user'],
                password=ftp_params['ftp_password']
            )
            for remote_file in new_files:
                # Cast to byte-string for scrapy compatibility
                remote_file = str(remote_file)
                ftp_params["ftp_local_filename"] = os.path.join(
                    self.target_folder,
                    os.path.basename(remote_file)
                )
                remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
                yield Request(
                    str(remote_url),
                    meta=ftp_params,
                    callback=self.handle_package_ftp
                )

    def handle_package_ftp(self, response):
        """Handle a zip package and yield every XML found."""
        self.log("Visited %s" % response.url)
        zip_filepath = response.body
        zip_target_folder, dummy = os.path.splitext(zip_filepath)
        xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
        for xml_file in xml_files:
            yield Request(
                "file://{0}".format(xml_file),
                meta={"package_path": zip_filepath}
            )

    def handle_package_file(self, response):
        """Handle a local zip package and yield every XML."""
        zip_filepath = urlparse.urlsplit(response.url).path
        zip_target_folder, dummy = os.path.splitext(zip_filepath)
        xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
        for xml_file in xml_files:
            yield Request(
                "file://{0}".format(xml_file),
                meta={"package_path": zip_filepath}
            )

    def parse_node(self, response, node):
        """Parse a WSP XML file into a HEP record."""
        node.remove_namespaces()
        article_type = node.xpath('@article-type').extract()
        self.log("Got article_type {0}".format(article_type))
        if not article_type or article_type[0] not in self.allowed_article_types:
            # Filter out non-interesting article types
            return None
        # Work with the article type as a plain string from here on.
        article_type = article_type[0]

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        if article_type in ['correction', 'addendum']:
            record.add_xpath('related_article_doi',
                             "//related-article[@ext-link-type='doi']/@href")
        record.add_value('journal_doctype', article_type)

        record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
        record.add_xpath('page_nr', "//counts/page-count/@count")
        record.add_xpath('abstract', '//abstract[1]')
        record.add_xpath('title', '//article-title/text()')
        record.add_xpath('subtitle', '//subtitle/text()')

        record.add_value('authors', self._get_authors(node))
        record.add_xpath('collaborations', "//contrib/collab/text()")

        free_keywords, classification_numbers = self._get_keywords(node)
        record.add_value('free_keywords', free_keywords)
        record.add_value('classification_numbers', classification_numbers)

        # TODO: Special journal title handling
        # journal, volume = fix_journal_name(journal, self.journal_mappings)
        # volume += get_value_in_tag(self.document, 'volume')
        journal_title_xpath = '//abbrev-journal-title/text()|//journal-title/text()'
        record.add_xpath('journal_title', journal_title_xpath)
        record.add_xpath('journal_issue', '//issue/text()')
        record.add_xpath('journal_volume', '//volume/text()')
        record.add_xpath('journal_artid', '//elocation-id/text()')

        record.add_xpath('journal_fpage', '//fpage/text()')
        record.add_xpath('journal_lpage', '//lpage/text()')

        published_date = self._get_published_date(node)
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)

        record.add_xpath('copyright_holder', '//copyright-holder/text()')
        record.add_xpath('copyright_year', '//copyright-year/text()')
        record.add_xpath('copyright_statement', '//copyright-statement/text()')
        record.add_value('copyright_material', 'Article')

        license = get_license(
            license_url=node.xpath(
                '//license/license-p/ext-link/@href').extract_first(),
            license_text=node.xpath(
                '//license/license-p/ext-link/text()').extract_first(),
        )
        record.add_value('license', license)

        # Pass the extracted journal title (not the XPath expression) so the
        # conference-series check in _get_collections can match.
        journal_title = node.xpath(journal_title_xpath).extract_first()
        record.add_value('collections',
                         self._get_collections(node, article_type, journal_title))

        parsed_record = dict(record.load_item())
        return parsed_record
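
    # Rough sketch (illustrative only) of the JATS elements parse_node() reads.
    # Element and attribute names are taken from the XPath expressions above;
    # the nesting shown is just typical JATS, not a requirement, since every
    # expression uses '//':
    #
    #   <article article-type="research-article">
    #     <article-id pub-id-type="doi">...</article-id>
    #     <article-title>...</article-title>
    #     <abstract>...</abstract>
    #     <contrib><collab>...</collab></contrib>
    #     <journal-title>...</journal-title>
    #     <volume>...</volume> <issue>...</issue> <fpage>...</fpage> <lpage>...</lpage>
    #     <license><license-p><ext-link>...</ext-link></license-p></license>
    #   </article>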

    def _get_collections(self, node, article_type, current_journal_title):
        """Return this article's collections."""
        conference = node.xpath('.//conference').extract()
        if (conference or current_journal_title ==
                "International Journal of Modern Physics: Conference Series"):
            return ['HEP', 'ConferencePaper']
        elif article_type == "review-article":
            return ['HEP', 'Review']
        else:
            return ['HEP', 'Published']
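
    # Minimal sketch (not part of the original module) of driving this spider
    # from Python with Scrapy's CrawlerProcess instead of the `scrapy crawl`
    # command shown in the class docstring; host and netrc path are placeholders:
    #
    #   from scrapy.crawler import CrawlerProcess
    #   from scrapy.utils.project import get_project_settings
    #
    #   process = CrawlerProcess(get_project_settings())
    #   process.crawl(WorldScientificSpider,
    #                 ftp_host='ftp.example.com',
    #                 ftp_netrc='/path/to/netrc')
    #   process.start()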