# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
"""Spider for PHENIX."""
from __future__ import absolute_import, print_function
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import XMLFeedSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
class PhenixSpider(XMLFeedSpider):
    """PHENIX crawler.

    Scrapes theses metadata from the PHENIX experiment web page:
    http://www.phenix.bnl.gov/WWW/talk/theses.php

    1. Every node of the html page matched by ``itertag`` is handed to
       ``parse_node()``, which builds a HEPRecord for Ph.D. theses and
       skips every other entry.

    Example usage:

    .. code-block:: console

        scrapy crawl phenix
        scrapy crawl phenix -a source_file=file://`pwd`/tests/responses/phenix/test_list.html -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    name = 'phenix'
    start_urls = ["http://www.phenix.bnl.gov/WWW/talk/theses.php"]
    # Base URL against which the (possibly relative) file links are resolved.
    domain = "http://www.phenix.bnl.gov"
    iterator = "html"
    itertag = "//table//td/ul/li"

    # Words dropped from the "<degree> thesis at <institution>" fragment so
    # that only the institution name is left.
    _AFFILIATION_STOP_WORDS = frozenset(["Ph.D.", "Master", "thesis", "at"])

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct PHENIX spider.

        :param source_file: optional URL (e.g. ``file://...``) of a page to
            crawl instead of ``start_urls``.
        """
        super(PhenixSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file

    def start_requests(self):
        """Yield the initial requests.

        When ``source_file`` was given only that URL is requested, which
        lets the spider run on local test files; otherwise every URL in
        ``start_urls`` is requested.
        """
        if self.source_file:
            yield Request(self.source_file)
        else:
            for url in self.start_urls:
                yield Request(url)

    @staticmethod
    def parse_datablock(node):
        """Get data out of the text block where there's title,
        affiliation and year.

        The first direct text node of ``node`` is read. The title is the
        text between the first pair of double quotes; after discarding the
        comma-separated pieces containing "archive", the last piece is the
        year and the one before it the affiliation line.

        :param node: selector of one list entry.
        :returns: tuple ``(title, year, affiliation, thesis_type)``.
            ``thesis_type`` is ``"PhD"`` when the text contains "Ph.D.",
            otherwise ``None``. Fields that cannot be extracted are
            returned as empty strings instead of raising ``IndexError``,
            so that an irregular entry does not abort the crawl.
        """
        text_nodes = node.xpath("./text()").extract()
        if not text_nodes:
            return "", "", "", None
        datablock = text_nodes[0]

        thesis_type = "PhD" if "Ph.D." in datablock else None

        # Splitting on the double quote puts the title at index 1; entries
        # without a quoted title produce a single piece.
        quoted = datablock.split('"')
        title = quoted[1] if len(quoted) > 1 else ""

        fields = [
            piece for piece in datablock.strip().split(",")
            if "archive" not in piece
        ]
        year = fields.pop().strip() if fields else ""
        affline = fields.pop().strip() if fields else ""
        affiliation = " ".join(
            word for word in affline.split()
            if word not in PhenixSpider._AFFILIATION_STOP_WORDS
        )

        return title, year, affiliation, thesis_type

    def get_authors(self, node, affiliation=None):
        """Return the list of author dictionaries of a list entry.

        :param node: selector of one list entry; author names are read
            from its ``<b>`` children.
        :param affiliation: affiliation assigned to every author. When
            ``None`` (the default) it is extracted from ``node`` with
            ``parse_datablock()``.
        :returns: list of dicts with ``raw_name`` and ``affiliations``.
        """
        if affiliation is None:
            _, _, affiliation, _ = self.parse_datablock(node)

        return [
            {
                'raw_name': raw_name,
                'affiliations': [{"value": affiliation}],
            }
            for raw_name in node.xpath("./b/text()").extract()
        ]

    def add_fft_file(self, pdf_files, file_access, file_type):
        """Create structured file dictionaries for the given links.

        :param pdf_files: iterable of links; each one is resolved against
            ``self.domain``.
        :param file_access: value stored under the ``access`` key.
        :param file_type: value stored under the ``type`` key.
        :returns: list with one dictionary per link.
        """
        description = self.name.title()
        return [
            {
                "access": file_access,
                "description": description,
                "url": urljoin(self.domain, link),
                "type": file_type,
            }
            for link in pdf_files
        ]

    def parse_node(self, response, node):
        """Parse one entry of the PHENIX web page into a HEP record.

        :param response: response the node was extracted from.
        :param node: selector of one list entry.
        :returns: the loaded item, or ``None`` when the entry is not a
            Ph.D. thesis.
        """
        title, year, affiliation, thesis_type = self.parse_datablock(node)
        if not thesis_type:
            return None

        # The loader is only built for entries that are actually kept.
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pdf_files = node.xpath(".//a/@href").extract()
        record.add_value(
            'additional_files',
            self.add_fft_file(pdf_files, "HIDDEN", "Fulltext"),
        )
        # Hand over the already parsed affiliation instead of re-parsing.
        record.add_value('authors', self.get_authors(node, affiliation))
        record.add_value('date_published', year)
        record.add_value('thesis', {'degree_type': thesis_type})
        record.add_value('title', title)
        record.add_value('urls', self.start_urls)
        record.add_value('source', 'PHENIX')
        record.add_value('collections', ['HEP', 'THESIS'])

        return record.load_item()