# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for ALPHA."""

from __future__ import absolute_import, print_function

import re

from urlparse import urljoin  # Python 2 stdlib; urllib.parse on Python 3

from scrapy import Request
from scrapy.spiders import CrawlSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import has_numbers


class AlphaSpider(CrawlSpider):
    """Alpha crawler

    Scrapes theses metadata from the Alpha experiment web page.
    http://alpha.web.cern.ch/publications#thesis

    1. parse() iterates through every record on the HTML page and yields
       a HEPRecord.

    Example usage:

    .. code-block:: console

        scrapy crawl alpha -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl alpha -a source_file=file://`pwd`/tests/responses/alpha/test_1.htm -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    name = 'alpha'
    start_urls = ["http://alpha.web.cern.ch/publications#thesis"]
    domain = "http://alpha.web.cern.ch/"
    itertag = "//div[@class = 'node node-thesis']"

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct Alpha spider."""
        super(AlphaSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file

    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            for url in self.start_urls:
                yield Request(url)

    def parse_author_data(self, thesis):
        """Parse the line that contains data about the author(s)."""
        author_line = thesis.xpath(
            "./div[@class = 'content clearfix']//div[@class='field-item even']"
            "/p[contains(text(),'Thesis')]/text()"
        ).extract()

        author_list = re.sub(r'[\n\t\xa0]', '', author_line[0]).split(",")
        # Author name might contain unwanted characters.
        author = author_list[0]

        year = ''
        thesis_type = ''
        affiliation = ''
        for i in author_list:
            if "thesis" in i.lower():
                thesis_type = re.sub(r"thesis|Thesis", "", i).strip()
            if "university" in i.lower():
                affiliation = re.sub(r"[^A-Za-z\s]+", '', i).strip()
            if has_numbers(i):
                # Affiliation element might include the year
                year = re.findall(r'\d+', i)[0].strip()

        authors = [{
            'raw_name': author,
            'affiliations': [{"value": affiliation}]
        }]

        return authors, thesis_type, year
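
    # Illustrative example (the input line is hypothetical, not taken from
    # the live page): given an author line such as
    #     "J. Smith, PhD Thesis, University of Somewhere, 2014"
    # parse_author_data() would return roughly
    #     authors     = [{'raw_name': 'J. Smith',
    #                     'affiliations': [{'value': 'University of Somewhere'}]}]
    #     thesis_type = 'PhD'
    #     year        = '2014'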

    def get_abstract(self, thesis):
        """Return a unified abstract, even if it is split into multiple paragraphs."""
        # Take every non-empty paragraph except the last one, which holds
        # the author line rather than abstract text.
        abs_paragraphs = thesis.xpath(
            "./div[@class = 'content clearfix']//div[@class='field-item even']"
            "/p[normalize-space()][string-length(text()) > 0][position() < last()]/text()"
        ).extract()
        whole_abstract = " ".join(abs_paragraphs)
        return whole_abstract

    def get_title(self, node):
        """Return the record title and its absolute URL."""
        title = node.xpath(
            "./div[@class = 'node-headline clearfix']//a/text()").extract()
        rel_url = node.xpath(
            "./div[@class = 'node-headline clearfix']//a/@href").extract()
        urls = [urljoin(self.domain, rel_url[0])]
        return title, urls
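
    # For example, a relative href of "/publications/thesis-xyz" (a
    # hypothetical value) would be joined with ``domain`` to give
    # "http://alpha.web.cern.ch/publications/thesis-xyz".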

    def parse(self, response):
        """Parse Alpha web page into a HEP record."""
        # Random <br>'s will create problems
        response = response.replace(body=response.body.replace('<br />', ''))
        node = response.selector

        for thesis in node.xpath(self.itertag):
            record = HEPLoader(
                item=HEPRecord(), selector=thesis, response=response)

            authors, thesis_type, year = self.parse_author_data(thesis)
            # Only PhD theses are harvested.
            if "phd" not in thesis_type.lower():
                continue
            record.add_value('authors', authors)
            record.add_value('date_published', year)
            record.add_value('thesis', {'degree_type': thesis_type})

            title, urls = self.get_title(thesis)
            record.add_value('title', title)
            record.add_value('urls', urls)

            abstract = self.get_abstract(thesis)
            record.add_value("abstract", abstract)

            record.add_xpath(
                'file_urls',
                "./div[@class = 'content clearfix']//span[@class='file']/a/@href")
            record.add_value('source', 'Alpha experiment')
            record.add_value('collections', ['HEP', 'THESIS'])

            yield record.load_item()
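

if __name__ == '__main__':
    # Minimal sketch for running this spider outside the ``scrapy crawl``
    # command line shown in the class docstring. It assumes the hepcrawl
    # project settings (pipelines, JSON_OUTPUT_DIR, ...) are discoverable
    # via scrapy.cfg; this block is not part of the original module.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(AlphaSpider)
    process.start()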