# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for World Scientific."""

from __future__ import absolute_import, print_function

import os
import urlparse

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..extractors.jats import Jats
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    ftp_list_files,
    ftp_connection_info,
    get_license,
    unzip_xml_files,
)


class WorldScientificSpider(Jats, XMLFeedSpider):
    """World Scientific Proceedings crawler.

    This spider connects to a given FTP host and downloads zip files with
    XML files for extraction into HEP records. This means that it generates
    the URLs for Scrapy to crawl in a special way:

    1. First it connects to the FTP host, lists all the new ZIP files found
       on the remote server and downloads them to a designated local folder,
       using `start_requests()`.

    2. Then each ZIP file is unpacked and all the XML files found inside are
       listed, via `handle_package_ftp()` (or `handle_package_file()` for a
       local package), the callback set in `start_requests()`.

    3. Finally, each XML file is parsed via `parse_node()`.

    To run a crawl, you need to pass FTP connection information via
    `ftp_host` and `ftp_netrc`:

    .. code-block:: console

        scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'

    Happy crawling!
    """

    name = 'WSP'
    custom_settings = {}
    start_urls = []
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'article'

    allowed_article_types = [
        'research-article',
        'corrected-article',
        'original-article',
        'introduction',
        'letter',
        'correction',
        'addendum',
        'review-article',
        'rapid-communications'
    ]

    def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None,
                 ftp_netrc=None, *args, **kwargs):
        """Construct WSP spider."""
        super(WorldScientificSpider, self).__init__(*args, **kwargs)
        self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
        self.ftp_netrc = ftp_netrc
        self.target_folder = "/tmp/WSP"
        self.package_path = package_path
        if not os.path.exists(self.target_folder):
            os.makedirs(self.target_folder)
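
    # Illustrative alternative (not in the upstream docstring): a ZIP package
    # already on disk can be crawled directly, bypassing FTP, by passing its
    # file:// URL as `package_path`; the path below is only a placeholder.
    #
    #   scrapy crawl WSP -a 'package_path=file:///path/to/local/package.zip'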

    def start_requests(self):
        """List selected folder on remote FTP and yield new zip files."""
        if self.package_path:
            yield Request(self.package_path, callback=self.handle_package_file)
        else:
            ftp_host, ftp_params = ftp_connection_info(self.ftp_host,
                                                       self.ftp_netrc)
            dummy, new_files = ftp_list_files(
                self.ftp_folder,
                self.target_folder,
                server=ftp_host,
                user=ftp_params['ftp_user'],
                password=ftp_params['ftp_password']
            )
            for remote_file in new_files:
                # Cast to byte-string for scrapy compatibility
                remote_file = str(remote_file)
                ftp_params["ftp_local_filename"] = os.path.join(
                    self.target_folder,
                    os.path.basename(remote_file)
                )
                remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
                yield Request(
                    str(remote_url),
                    meta=ftp_params,
                    callback=self.handle_package_ftp
                )

    def handle_package_ftp(self, response):
        """Handle a zip package and yield every XML found."""
        self.log("Visited %s" % response.url)
        zip_filepath = response.body
        zip_target_folder, dummy = os.path.splitext(zip_filepath)
        xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
        for xml_file in xml_files:
            yield Request(
                "file://{0}".format(xml_file),
                meta={"package_path": zip_filepath}
            )

    def handle_package_file(self, response):
        """Handle a local zip package and yield every XML."""
        zip_filepath = urlparse.urlsplit(response.url).path
        zip_target_folder, dummy = os.path.splitext(zip_filepath)
        xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
        for xml_file in xml_files:
            yield Request(
                "file://{0}".format(xml_file),
                meta={"package_path": zip_filepath}
            )

    def parse_node(self, response, node):
        """Parse a WSP XML file into a HEP record."""
        node.remove_namespaces()
        article_type = node.xpath('@article-type').extract()
        self.log("Got article_type {0}".format(article_type))
        if not article_type or article_type[0] not in self.allowed_article_types:
            # Filter out non-interesting article types
            return None
        # Work with the article type as a plain string from here on.
        article_type = article_type[0]

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        if article_type in ['correction', 'addendum']:
            record.add_xpath('related_article_doi',
                             "//related-article[@ext-link-type='doi']/@href")
        record.add_value('journal_doctype', article_type)

        record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
        record.add_xpath('page_nr', "//counts/page-count/@count")
        record.add_xpath('abstract', '//abstract[1]')
        record.add_xpath('title', '//article-title/text()')
        record.add_xpath('subtitle', '//subtitle/text()')

        record.add_value('authors', self._get_authors(node))
        record.add_xpath('collaborations', "//contrib/collab/text()")

        free_keywords, classification_numbers = self._get_keywords(node)
        record.add_value('free_keywords', free_keywords)
        record.add_value('classification_numbers', classification_numbers)

        # TODO: Special journal title handling
        # journal, volume = fix_journal_name(journal, self.journal_mappings)
        # volume += get_value_in_tag(self.document, 'volume')
        journal_title_xpath = '//abbrev-journal-title/text()|//journal-title/text()'
        record.add_xpath('journal_title', journal_title_xpath)
        record.add_xpath('journal_issue', '//issue/text()')
        record.add_xpath('journal_volume', '//volume/text()')
        record.add_xpath('journal_artid', '//elocation-id/text()')

        record.add_xpath('journal_fpage', '//fpage/text()')
        record.add_xpath('journal_lpage', '//lpage/text()')

        published_date = self._get_published_date(node)
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)

        record.add_xpath('copyright_holder', '//copyright-holder/text()')
        record.add_xpath('copyright_year', '//copyright-year/text()')
        record.add_xpath('copyright_statement', '//copyright-statement/text()')
        record.add_value('copyright_material', 'Article')

        license = get_license(
            license_url=node.xpath(
                '//license/license-p/ext-link/@href').extract_first(),
            license_text=node.xpath(
                '//license/license-p/ext-link/text()').extract_first(),
        )
        record.add_value('license', license)

        # Pass the extracted journal title (not the XPath expression) so the
        # conference-series check in _get_collections can match.
        journal_title = node.xpath(journal_title_xpath).extract_first()
        record.add_value('collections',
                         self._get_collections(node, article_type, journal_title))

        parsed_record = dict(record.load_item())
        return parsed_record
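
    # Rough sketch (illustrative only) of the JATS elements parse_node() reads.
    # Element and attribute names are taken from the XPath expressions above;
    # the nesting shown is just typical JATS, not a requirement, since every
    # expression uses '//':
    #
    #   <article article-type="research-article">
    #     <article-id pub-id-type="doi">...</article-id>
    #     <article-title>...</article-title>
    #     <abstract>...</abstract>
    #     <contrib><collab>...</collab></contrib>
    #     <journal-title>...</journal-title>
    #     <volume>...</volume> <issue>...</issue> <fpage>...</fpage> <lpage>...</lpage>
    #     <license><license-p><ext-link>...</ext-link></license-p></license>
    #   </article>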

    def _get_collections(self, node, article_type, current_journal_title):
        """Return this article's collections."""
        conference = node.xpath('.//conference').extract()
        if (conference or current_journal_title ==
                "International Journal of Modern Physics: Conference Series"):
            return ['HEP', 'ConferencePaper']
        elif article_type == "review-article":
            return ['HEP', 'Review']
        else:
            return ['HEP', 'Published']
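
    # Minimal sketch (not part of the original module) of driving this spider
    # from Python with Scrapy's CrawlerProcess instead of the `scrapy crawl`
    # command shown in the class docstring; host and netrc path are placeholders:
    #
    #   from scrapy.crawler import CrawlerProcess
    #   from scrapy.utils.project import get_project_settings
    #
    #   process = CrawlerProcess(get_project_settings())
    #   process.crawl(WorldScientificSpider,
    #                 ftp_host='ftp.example.com',
    #                 ftp_netrc='/path/to/netrc')
    #   process.start()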