# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
"""Spider for POS."""
import re
from scrapy import Request, Selector
from scrapy.spiders import Spider
from urlparse import urljoin
from ..utils import get_license, get_first
from ..dateutils import create_valid_date
from ..items import HEPRecord
from ..loaders import HEPLoader
class POSSpider(Spider):
    """POS/Sissa crawler.

    Extracts from metadata:
        title, article-id, conf-acronym, authors, affiliations,
        publication-date, publisher, license, language, link

    The spider reads an XML export (``source_file``), and for every record
    that carries an identifier it visits the corresponding PoS page to look
    for a PDF link before building the final record.

    .. code-block:: console

        scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml

    """
    name = 'PoS'
    # Prefix to which a record identifier is appended to get its PoS page.
    pos_base_url = "https://pos.sissa.it/contribution?id="

    def __init__(self, source_file=None, **kwargs):
        """Construct POS spider.

        :param source_file: URL of the XML export to crawl.
        :param kwargs: passed through unchanged to ``Spider.__init__``.
        """
        super(POSSpider, self).__init__(**kwargs)
        self.source_file = source_file

    def start_requests(self):
        """Yield the single request that fetches ``source_file``."""
        yield Request(self.source_file)

    def parse(self, response):
        """Yield one PoS page request per record that has an identifier.

        The serialized record and the URL of the export are stored in the
        request ``meta`` so that later callbacks can build the item.
        Records without an identifier are skipped.
        """
        node = response.selector
        node.remove_namespaces()
        for record in node.xpath('.//record'):
            # Only the first identifier is used; probably all of them lead
            # to the same place.
            identifier = record.xpath(
                './/metadata/pex-dc/identifier/text()'
            ).extract_first()
            if not identifier:
                continue
            pos_url = "{0}{1}".format(self.pos_base_url, identifier)
            request = Request(pos_url, callback=self.scrape_pos_page)
            request.meta["url"] = response.url
            request.meta["record"] = record.extract()
            yield request

    def scrape_pos_page(self, response):
        """Look for a PDF link on the PoS page, then build the item.

        Sets ``response.meta["pos_pdf_url"]`` to the absolute URL of the
        first anchor whose text contains ``pdf``, or to ``None`` when the
        page has no such anchor, and ``response.meta["pos_url"]`` to the URL
        of the page itself.
        """
        pdf_href = response.selector.xpath(
            "//a[contains(text(),'pdf')]/@href"
        ).extract_first()
        # ``urljoin`` returns the base URL unchanged when given an empty
        # link, which would make every record look like it has a fulltext
        # file. Only join when a link was actually found.
        if pdf_href:
            response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, pdf_href)
        else:
            response.meta["pos_pdf_url"] = None
        response.meta["pos_url"] = response.url
        return self.build_item(response)

    def build_item(self, response):
        """Parse an PoS XML exported file into a HEP record.

        Reads the serialized record from ``response.meta["record"]`` and the
        ``pos_url`` / ``pos_pdf_url`` values stored by ``scrape_pos_page``.

        :returns: the item produced by ``HEPLoader.load_item()``.
        """
        text = response.meta["record"]
        node = Selector(text=text, type="xml")
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node)

        record.add_xpath('title', '//metadata/pex-dc/title/text()')
        record.add_xpath('source', '//metadata/pex-dc/publisher/text()')
        record.add_value(
            'external_system_numbers', self._get_ext_systems_number(node)
        )

        license_text = node.xpath(
            ".//metadata/pex-dc/rights/text()"
        ).extract_first()
        record.add_value('license', get_license(license_text=license_text))

        date, year = self._get_date(node)
        if date:
            record.add_value('date_published', date)
        if year:
            record.add_value('journal_year', int(year))

        record.add_value('urls', response.meta['pos_url'])
        pdf_url = response.meta['pos_pdf_url']
        if pdf_url:
            record.add_value(
                'additional_files', {'type': "Fulltext", "url": pdf_url}
            )

        identifier = node.xpath(
            ".//metadata/pex-dc/identifier/text()"
        ).extract_first()
        if identifier:
            # Splitting on the parentheses of an identifier of the form
            # ``<journal>(<conference>)<article id>`` gives three parts.
            pbn = re.split(r'[()]', identifier)
            if len(pbn) == 3:
                journal_title, conf_acronym, article_id = pbn
                record.add_value('journal_title', journal_title)
                record.add_value('journal_volume', conf_acronym)
                record.add_value('journal_artid', article_id)
            else:
                record.add_value('pubinfo_freetext', identifier)

        language = node.xpath(
            ".//metadata/pex-dc/language/text()"
        ).extract_first()
        if language:
            record.add_value('language', language)

        authors = self._get_authors(node)
        if authors:
            record.add_value('authors', authors)

        extra_data = self._get_extra_data(node)
        if extra_data:
            record.add_value('extra_data', extra_data)

        record.add_value('collections', ['HEP', 'ConferencePaper'])
        return record.load_item()

    def _get_ext_systems_number(self, node):
        """Return the two PoS external system number entries for a record.

        The first value is the identifier found under ``metadata/pex-dc``;
        the second is the first ``identifier`` element found anywhere in
        the record. Either value is ``None`` when the element is missing.
        """
        return [
            {
                'institute': 'PoS',
                'value': node.xpath(
                    './/metadata/pex-dc/identifier/text()'
                ).extract_first()
            },
            {
                'institute': 'PoS',
                'value': node.xpath('.//identifier/text()').extract_first()
            },
        ]

    def _get_date(self, node):
        """Get article date.

        :returns: a ``(date, year)`` tuple, where ``year`` is the first four
            characters of ``date``. ``year`` is an empty string when no
            valid date is available.
        """
        full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first()
        if not full_date:
            # No date element in the record: there is nothing to validate,
            # so ``create_valid_date`` is not called with ``None``.
            return '', ''
        date = create_valid_date(full_date)
        year = date[0:4] if date else ''
        return date, year

    def _get_authors(self, node):
        """Get article authors.

        :returns: one dict per ``creator`` element, always with a
            ``raw_name`` key (empty string when no name is found) and with
            an ``affiliations`` list only when the creator has affiliations.
        """
        authors = []
        for creator in node.xpath('.//metadata/pex-dc/creator'):
            author = Selector(text=creator.extract())
            auth_dict = {
                'raw_name': get_first(
                    author.xpath('.//name//text()').extract(), default=''
                ),
            }
            affiliations = [
                {'value': affiliation}
                for affiliation
                in author.xpath('.//affiliation//text()').extract()
            ]
            if affiliations:
                auth_dict['affiliations'] = affiliations
            authors.append(auth_dict)
        return authors

    def _get_extra_data(self, node):
        """Get info to help selection - not for INSPIRE record.

        :returns: ``{'section': ...}`` holding the description text after
            its first ``;`` (or the whole text when there is no ``;``),
            stripped of surrounding whitespace. An empty dict is returned
            when the record has no description.
        """
        section = node.xpath(
            ".//metadata/pex-dc/description/text()"
        ).extract_first()
        if section is None:
            return {}
        return {'section': section.split(';', 1)[-1].strip()}