Source code for hepcrawl.spiders.phenix_spider

# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for PHENIX."""

from __future__ import absolute_import, print_function

from urlparse import urljoin

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..items import HEPRecord
from ..loaders import HEPLoader


class PhenixSpider(XMLFeedSpider):

    """PHENIX crawler

    Scrapes theses metadata from the PHENIX experiment web page.

    http://www.phenix.bnl.gov/WWW/talk/theses.php

    1. parse() iterates through every record on the HTML page and yields
       a HEPRecord.

    Example usage:

    .. code-block:: console

        scrapy crawl phenix
        scrapy crawl phenix -a source_file=file://`pwd`/tests/responses/phenix/test_list.html -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    name = 'phenix'
    start_urls = ["http://www.phenix.bnl.gov/WWW/talk/theses.php"]
    domain = "http://www.phenix.bnl.gov"
    iterator = "html"
    itertag = "//table//td/ul/li"

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct PHENIX spider."""
        super(PhenixSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file
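
    # For illustration only: a hypothetical list item matching ``itertag``
    # (structure inferred from the XPaths used below, not copied from the
    # live page) might look like:
    #
    #   <li><b>J. Doe</b>, "Some Thesis Title",
    #       Ph.D. thesis at Example University, 2015
    #       <a href="/docs/thesis.pdf">PDF</a></li>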

    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            for url in self.start_urls:
                yield Request(url)

    @staticmethod
    def parse_datablock(node):
        """Get data out of the text block containing title, affiliation,
        and year.
        """
        datablock = node.xpath("./text()").extract()[0]
        datalist = datablock.strip().split(",")

        thesis_type = None
        if "Ph.D." in datablock:
            thesis_type = "PhD"

        title = datablock.split('"')[1]
        datalist = [el for el in datalist if "archive" not in el]
        year = datalist.pop().strip()
        affline = datalist.pop().strip()
        stop_words = {"Ph.D.", "Master", "thesis", "at"}
        affiliation = " ".join(
            [w for w in affline.split() if w not in stop_words])

        return title, year, affiliation, thesis_type
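
    # A hypothetical example (input string invented for illustration): a
    # text block such as
    #
    #   ', "Some Thesis Title", Ph.D. thesis at Example University, 2015'
    #
    # parses to roughly
    #
    #   ("Some Thesis Title", "2015", "Example University", "PhD")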

    def get_authors(self, node):
        """Return a list of author dictionaries."""
        author = node.xpath("./b/text()").extract()
        authors = []
        _, _, affiliation, _ = self.parse_datablock(node)
        for aut in author:
            authors.append({
                'raw_name': aut,
                'affiliations': [{"value": affiliation}],
            })

        return authors

    def add_fft_file(self, pdf_files, file_access, file_type):
        """Create structured dictionaries for the 'additional_files' item."""
        file_dicts = []
        for link in pdf_files:
            file_dict = {
                "access": file_access,
                "description": self.name.title(),
                "url": urljoin(self.domain, link),
                "type": file_type,
            }
            file_dicts.append(file_dict)
        return file_dicts
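
    # For example (hypothetical link), add_fft_file(["/docs/thesis.pdf"],
    # "HIDDEN", "Fulltext") would return something like:
    #
    #   [{"access": "HIDDEN",
    #     "description": "Phenix",
    #     "url": "http://www.phenix.bnl.gov/docs/thesis.pdf",
    #     "type": "Fulltext"}]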

    def parse_node(self, response, node):
        """Parse PHENIX web page into a HEP record."""
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        title, year, _, thesis_type = self.parse_datablock(node)
        if not thesis_type:
            return None

        pdf_files = node.xpath(".//a/@href").extract()
        record.add_value('additional_files',
                         self.add_fft_file(pdf_files, "HIDDEN", "Fulltext"))
        record.add_value('authors', self.get_authors(node))
        record.add_value('date_published', year)
        record.add_value('thesis', {'degree_type': thesis_type})
        record.add_value('title', title)
        record.add_value('urls', self.start_urls)
        record.add_value('source', 'PHENIX')
        record.add_value('collections', ['HEP', 'THESIS'])

        return record.load_item()
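

# A minimal offline sketch (hypothetical HTML invented for illustration,
# assuming Scrapy is installed) showing how one list item flows through
# parse_datablock():
if __name__ == '__main__':
    from scrapy.selector import Selector

    body = (
        '<table><tr><td><ul><li><b>J. Doe</b>, '
        '"Some Thesis Title", Ph.D. thesis at Example University, 2015 '
        '<a href="/docs/thesis.pdf">PDF</a></li></ul></td></tr></table>'
    )
    node = Selector(text=body).xpath(PhenixSpider.itertag)[0]
    # Expect roughly: ('Some Thesis Title', '2015', 'Example University', 'PhD')
    print(PhenixSpider.parse_datablock(node))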