# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for ALPHA."""

from __future__ import absolute_import, print_function

import re

from urlparse import urljoin  # Python 2 stdlib; urllib.parse on Python 3

from scrapy import Request
from scrapy.spiders import CrawlSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import has_numbers


class AlphaSpider(CrawlSpider):
    """Alpha crawler

    Scrapes theses metadata from the Alpha experiment web page.
    http://alpha.web.cern.ch/publications#thesis

    1. parse() iterates through every record on the HTML page and yields
       a HEPRecord.

    Example usage:

    .. code-block:: console

        scrapy crawl alpha -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl alpha -a source_file=file://`pwd`/tests/responses/alpha/test_1.htm -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    name = 'alpha'
    start_urls = ["http://alpha.web.cern.ch/publications#thesis"]
    domain = "http://alpha.web.cern.ch/"
    itertag = "//div[@class = 'node node-thesis']"

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct Alpha spider."""
        super(AlphaSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file

    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            for url in self.start_urls:
                yield Request(url)

    def parse_author_data(self, thesis):
        """Parse the line that contains data about the author(s)."""
        author_line = thesis.xpath(
            "./div[@class = 'content clearfix']//div[@class='field-item even']"
            "/p[contains(text(),'Thesis')]/text()"
        ).extract()

        author_list = re.sub(r'[\n\t\xa0]', '', author_line[0]).split(",")
        # Author name might contain unwanted characters.
        author = author_list[0]

        year = ''
        thesis_type = ''
        affiliation = ''
        for i in author_list:
            if "thesis" in i.lower():
                thesis_type = re.sub(r"thesis|Thesis", "", i).strip()
            if "university" in i.lower():
                affiliation = re.sub(r"[^A-Za-z\s]+", '', i).strip()
            if has_numbers(i):
                # Affiliation element might include the year
                year = re.findall(r'\d+', i)[0].strip()

        authors = [{
            'raw_name': author,
            'affiliations': [{"value": affiliation}]
        }]

        return authors, thesis_type, year
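
    # Illustrative example (the input line is hypothetical, not taken from
    # the live page): given an author line such as
    #     "J. Smith, PhD Thesis, University of Somewhere, 2014"
    # parse_author_data() would return roughly
    #     authors     = [{'raw_name': 'J. Smith',
    #                     'affiliations': [{'value': 'University of Somewhere'}]}]
    #     thesis_type = 'PhD'
    #     year        = '2014'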

    def get_abstract(self, thesis):
        """Return a unified abstract, even if it is split into multiple paragraphs."""
        # Take every non-empty paragraph except the last one, which holds
        # the author line rather than abstract text.
        abs_paragraphs = thesis.xpath(
            "./div[@class = 'content clearfix']//div[@class='field-item even']"
            "/p[normalize-space()][string-length(text()) > 0][position() < last()]/text()"
        ).extract()
        whole_abstract = " ".join(abs_paragraphs)
        return whole_abstract

    def get_title(self, node):
        """Return the record title and its absolute URL."""
        title = node.xpath(
            "./div[@class = 'node-headline clearfix']//a/text()").extract()
        rel_url = node.xpath(
            "./div[@class = 'node-headline clearfix']//a/@href").extract()
        urls = [urljoin(self.domain, rel_url[0])]
        return title, urls
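
    # For example, a relative href of "/publications/thesis-xyz" (a
    # hypothetical value) would be joined with ``domain`` to give
    # "http://alpha.web.cern.ch/publications/thesis-xyz".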

    def parse(self, response):
        """Parse Alpha web page into a HEP record."""
        # Random <br>'s will create problems
        response = response.replace(body=response.body.replace('<br />', ''))
        node = response.selector

        for thesis in node.xpath(self.itertag):
            record = HEPLoader(
                item=HEPRecord(), selector=thesis, response=response)

            authors, thesis_type, year = self.parse_author_data(thesis)
            # Only PhD theses are harvested.
            if "phd" not in thesis_type.lower():
                continue
            record.add_value('authors', authors)
            record.add_value('date_published', year)
            record.add_value('thesis', {'degree_type': thesis_type})

            title, urls = self.get_title(thesis)
            record.add_value('title', title)
            record.add_value('urls', urls)

            abstract = self.get_abstract(thesis)
            record.add_value("abstract", abstract)

            record.add_xpath(
                'file_urls',
                "./div[@class = 'content clearfix']//span[@class='file']/a/@href")
            record.add_value('source', 'Alpha experiment')
            record.add_value('collections', ['HEP', 'THESIS'])

            yield record.load_item()
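

if __name__ == '__main__':
    # Minimal sketch for running this spider outside the ``scrapy crawl``
    # command line shown in the class docstring. It assumes the hepcrawl
    # project settings (pipelines, JSON_OUTPUT_DIR, ...) are discoverable
    # via scrapy.cfg; this block is not part of the original module.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(AlphaSpider)
    process.start()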