# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for INFN."""

from __future__ import absolute_import, print_function

from urlparse import urljoin

import datetime
import requests

from scrapy.http import Request
from scrapy.spiders import XMLFeedSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_temporary_file

from ..dateutils import format_date


class InfnSpider(XMLFeedSpider):
    """INFN crawler.

    Scrapes theses metadata from the INFN web page
    http://www.infn.it/thesis/index.php

    1. If no local html file is given, `get_list_file` fetches one using a
       POST request. The year is given as an argument; the default is the
       current year.
    2. `parse_node` iterates through every record on the html page.
    3. If a splash page link is found, a further request is made to scrape
       it for additional metadata.
    4. In the end, a HEPRecord is built.

    Example usage:

    .. code-block:: console

        scrapy crawl infn
        scrapy crawl infn -a source_file=file://`pwd`/tests/responses/infn/test_1.html -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl infn -a year=1999 -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    name = 'infn'
    start_urls = ["http://www.infn.it/thesis/index.php"]
    domain = "http://www.infn.it/thesis/"
    iterator = "html"
    itertag = "//tr[@onmouseover]"
    today = str(datetime.date.today().year)

    def __init__(self, source_file=None, year=today, *args, **kwargs):
        """Construct INFN spider."""
        super(InfnSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file
        self.year = year
    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            html_file = self.get_list_file(self.year)
            yield Request(html_file)
    def get_list_file(self, year):
        """Get data out of the query web page and save it locally."""
        post_data = {
            # Default is to fetch the current year.
            "TESI[data_conseguimentoyy]": year,
            "TESI[tesi_tipo]": "1",  # Doctoral
            "TESI[paginazione]": "0",  # All results
        }
        url = self.start_urls[0]
        req = requests.post(url, data=post_data)
        listing_file = get_temporary_file(prefix="infn_", suffix=".html")

        with open(listing_file, "w") as outfile:
            # Encode explicitly: the page text is unicode and may contain
            # accented characters.
            outfile.write(req.text.encode("utf-8"))

        call_uri = u"file://{0}".format(listing_file)
        return call_uri
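    # A minimal usage sketch (assumes network access to www.infn.it; the
    # temporary-file path shown is illustrative):
    #
    #     spider = InfnSpider(year="2015")
    #     listing = spider.get_list_file("2015")
    #     # listing == "file:///tmp/infn_XXXXXX.html", ready to be wrapped
    #     # in a scrapy Request by start_requests().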
    @staticmethod
    def _fix_node_text(text_nodes):
        """Join text split across multiple elements and clean unwanted
        whitespace.

        Input must be a list. Returns a string.
        """
        return " ".join(" ".join(text_nodes).split())
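    # For example (hypothetical input), _fix_node_text(["  Measurement of\n",
    # "the   decay "]) returns "Measurement of the decay".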
    def get_authors(self, node):
        """Return authors dictionary."""
        authors = []
        given_names_raw = node.xpath(
            "//tr//span[@id='autore_nome_text']/text()").extract()
        surname_raw = node.xpath(
            "//tr//span[@id='autore_cognome_text']/text()").extract()
        university = node.xpath(
            u"//tr/td[contains(text(), 'Universit\xe0')]/following-sibling::td/text()").extract()

        authdict = {}
        if given_names_raw:
            authdict["given_names"] = self._fix_node_text(given_names_raw)
        if surname_raw:
            authdict["surname"] = self._fix_node_text(surname_raw)
        if university:
            authdict["affiliations"] = [
                {"value": self._fix_node_text(university)}
            ]
        authors.append(authdict)

        return authors
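    # Illustration of the structure get_authors returns (values are
    # hypothetical):
    #
    #     [{"given_names": "Maria", "surname": "Rossi",
    #       "affiliations": [{"value": "Bologna"}]}]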
    def add_fft_file(self, pdf_files, file_access, file_type):
        """Create a structured dictionary to add to the 'files' item."""
        file_dicts = []
        for link in pdf_files:
            file_dict = {
                "access": file_access,
                "description": self.name.title(),
                "url": urljoin(self.domain, link),
                "type": file_type,
            }
            file_dicts.append(file_dict)
        return file_dicts
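    # Each dictionary in the returned list looks like this (the pdf path is
    # hypothetical):
    #
    #     {"access": "HIDDEN", "description": "Infn", "type": "Fulltext",
    #      "url": "http://www.infn.it/thesis/PDF/some_thesis.pdf"}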
    def get_thesis_info(self, node):
        """Create thesis info dictionary."""
        date_raw = node.xpath(
            u"//tr/td[contains(text(), 'Data conseguimento')]/following-sibling::td/text()").extract()
        university = node.xpath(
            u"//tr/td[contains(text(), 'Universit\xe0')]/following-sibling::td/text()").extract()

        thesis = {
            "date": format_date(self._fix_node_text(date_raw)),
            "institutions": [{"name": self._fix_node_text(university)}],
            "degree_type": "PhD",
        }

        return thesis
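    # The resulting dictionary, with hypothetical values:
    #
    #     {"date": "2015-02-18", "degree_type": "PhD",
    #      "institutions": [{"name": "Bologna"}]}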
    @staticmethod
    def get_thesis_supervisors(node):
        """Create a structured supervisor dictionary."""
        supervisors_raw = node.xpath(
            u"//tr/td[contains(text(), 'Relatore/i')]/following-sibling::td/text()").extract()
        supervisors = []
        for supervisor in supervisors_raw:
            supervisor = " ".join(supervisor.split())
            supervisors.append({
                'raw_name': supervisor,
            })

        return supervisors
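    # Illustration (hypothetical name): for a 'Relatore/i' cell containing
    # "Prof. G. Bianchi", the method returns
    # [{'raw_name': 'Prof. G. Bianchi'}].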
    def parse_node(self, response, node):
        """Parse INFN web page into a HEP record."""
        pdf_links = []
        splash_link = ''
        all_links = node.xpath(".//a/@href").extract()
        for link in all_links:
            if "thesis_dettaglio.php" in link:
                splash_link = urljoin(self.domain, link)
            if "pdf" in link:
                pdf_links.append(link)

        if splash_link:
            request = Request(splash_link, callback=self.scrape_splash)
            request.meta["splash_link"] = splash_link
            if pdf_links:
                request.meta["pdf_links"] = pdf_links
            yield request
        elif pdf_links:
            response.meta["pdf_links"] = pdf_links
            yield self.build_item(response)
    def scrape_splash(self, response):
        """Scrape INFN web page for more metadata."""
        node = response.selector

        thesis_type = node.xpath(
            u"//tr/td[contains(text(), 'Tipo')]/following-sibling::td/text()"
        ).extract_first()
        # Skip records that are not doctoral theses; thesis_type may be None
        # if the splash page has no 'Tipo' row.
        if not thesis_type or "dottorato" not in thesis_type.lower():
            return None

        date_published = node.xpath(
            "//tr[./th[contains(text(), 'aggiornamento')]]/td/text()").extract()
        experiment = node.xpath(
            "//tr[./th[contains(text(), 'Esperimento')]]/td/text()").extract_first()
        titles = node.xpath(
            u"//tr/td[contains(text(), 'Titolo')]/following-sibling::td/text()").extract()
        abstracts = node.xpath(
            u"//tr/td[contains(text(), 'Abstract')]/following-sibling::td/text()").extract()

        if "pdf_links" not in response.meta:
            response.meta["pdf_links"] = node.xpath(u"//tr/td/a/@href").extract()
        response.meta["thesis_info"] = self.get_thesis_info(node)
        response.meta["date_published"] = self._fix_node_text(date_published)
        response.meta["authors"] = self.get_authors(node)
        response.meta["experiment"] = experiment
        response.meta["titles"] = titles
        response.meta["abstract"] = abstracts
        response.meta["supervisors"] = self.get_thesis_supervisors(node)

        return self.build_item(response)
    def build_item(self, response):
        """Build the final HEPRecord item."""
        node = response.selector
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pdf_files = response.meta.get("pdf_links")
        if pdf_files:
            record.add_value(
                'additional_files',
                self.add_fft_file(pdf_files, "HIDDEN", "Fulltext"))
        record.add_value('authors', response.meta.get("authors"))
        record.add_value('date_published', response.meta.get("date_published"))
        record.add_value('thesis', response.meta.get("thesis_info"))
        record.add_value('thesis_supervisor', response.meta.get("supervisors"))
        record.add_value('title', response.meta.get("titles"))
        record.add_value('urls', response.meta.get("splash_link"))
        record.add_value('abstract', response.meta.get("abstract"))
        record.add_value('source', 'INFN')
        record.add_value('collections', ['HEP', 'THESIS'])

        return record.load_item()
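
if __name__ == "__main__":
    # Minimal sketch for running this spider standalone (assumes only a
    # working Scrapy installation; hepcrawl's own pipelines and settings,
    # e.g. JSON_OUTPUT_DIR, are omitted here).
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(InfnSpider, year="2015")
    process.start()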