# Source code for hepcrawl.spiders.dnb_spider

# -*- coding: utf-8 -*-
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for DNB Dissonline."""

from __future__ import absolute_import, print_function

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_mime_type, parse_domain, get_node

class DNBSpider(XMLFeedSpider):
    """DNB crawler.

    Scrapes Deutsche Nationalbibliothek (DNB) metadata XML files one at a
    time. The actual files should be retrieved from DNB via its OAI
    interface. A file can contain multiple records. This spider harvests
    only theses.

    This spider takes DNB metadata records which are stored in an XML file.

    1. The spider parses the local MARC21XML format file for record data.

    2. If a link to the original repository splash page exists,
       ``parse_node`` returns a request to scrape it for the abstract.
       This is only done for a few selected repositories (at least for now).

    3. Finally a ``HEPRecord`` is created in ``build_item``.

    Example usage:

    .. code-block:: console

        scrapy crawl DNB -a source_file=file://`pwd`/tests/responses/dnb/test_1.xml -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    # TODO: OAI harvester should fetch also DDC 520 theses, not only 530.

    name = 'DNB'
    start_urls = []
    iterator = 'xml'  # Needed for proper namespace handling
    itertag = 'slim:record'
    # TODO: Is this a good value and how to make this domain specific?
    download_delay = 5

    # Prefix -> namespace URI pairs registered for the XPath expressions
    # below. The URIs are the standard OAI-PMH 2.0 and MARC21 "slim"
    # namespaces; with empty URIs the ``slim:`` prefix used by ``itertag``
    # and every XPath in this class cannot resolve.
    namespaces = [
        ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/"),
        ("slim", "http://www.loc.gov/MARC21/slim"),
    ]

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct DNB spider.

        :param source_file: URL of the XML file to crawl (for a local file,
            a ``file://`` URL as in the class-level usage example).
        """
        super(DNBSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file

    def start_requests(self):
        """Default starting point for scraping shall be the local XML file."""
        yield Request(self.source_file)

    @staticmethod
    def get_affiliations(node):
        """Return the cleaned affiliation strings of a record.

        Reads MARC field 502 subfield ``a`` and, for every value found,
        drops the comma-separated parts that mention "diss" (in any case)
        or that consist only of digits; the remaining parts are re-joined
        with commas.

        :param node: selector for one ``slim:record`` element.
        :returns: list with one cleaned string per 502$a value.
        """
        affiliations_raw = node.xpath(
            "./slim:datafield[@tag='502']/slim:subfield[@code='a']/text()"
        ).extract()

        affiliations = []
        for aff_raw in affiliations_raw:
            kept_parts = [
                part for part in aff_raw.split(",")
                if "diss" not in part.lower() and not part.strip().isdigit()
            ]
            affiliations.append(",".join(kept_parts))
        return affiliations

    def get_authors(self, node):
        """Return the authors of a record.

        Every name found in MARC field 100 subfield ``a`` becomes one
        author dictionary; all of them receive the same affiliations, as
        returned by ``get_affiliations``.

        :param node: selector for one ``slim:record`` element.
        :returns: list of dicts with ``raw_name`` and ``affiliations`` keys.
        """
        authors_raw = node.xpath(
            "./slim:datafield[@tag='100']/slim:subfield[@code='a']/text()"
        ).extract()
        affiliations = self.get_affiliations(node)

        return [
            {
                'raw_name': author,
                # Build a fresh list per author so the dicts share no state.
                'affiliations': [{"value": aff} for aff in affiliations],
            }
            for author in authors_raw
        ]

    @staticmethod
    def get_thesis_supervisors(node):
        """Create a list of structured supervisor dictionaries.

        Selects MARC field 700 entries whose subfield ``e`` contains
        'Betreuer' and takes the name from subfield ``a``.

        :param node: selector for one ``slim:record`` element.
        :returns: list of dicts with a single ``raw_name`` key.
        """
        supervisors_raw = node.xpath(
            "./slim:datafield[@tag='700']"
            "[slim:subfield[@code='e'][contains(text(), 'Betreuer')]]"
            "/slim:subfield[@code='a']/text()"
        ).extract()
        return [{'raw_name': supervisor} for supervisor in supervisors_raw]

    @staticmethod
    def get_urls_in_record(node):
        """Return all the different urls in the xml.

        :param node: selector for one ``slim:record`` element.
        :returns: list of the texts of MARC field 856 subfield ``u``.
        """
        return node.xpath(
            "./slim:datafield[@tag='856']/slim:subfield[@code='u']/text()"
        ).extract()

    def parse_node(self, response, node):
        """Handle one record node of the XML.

        If the record has a splash page link, return a request that scrapes
        it for the abstract (handled by ``scrape_for_abstract``); otherwise
        call ``build_item`` directly to build the HEPRecord. In both cases
        the record's URLs, its serialized XML and (when present) its direct
        links travel along in ``meta``.

        :param response: response holding the XML feed.
        :param node: selector for the current ``slim:record`` element.
        :returns: the built item, or a ``Request`` for the first splash page.
        """
        urls_in_record = self.get_urls_in_record(node)
        # NOTE(review): ``find_direct_links`` is not defined in the visible
        # part of this class; it has to be provided elsewhere.
        direct_links, splash_links = self.find_direct_links(urls_in_record)

        meta = {
            "urls": urls_in_record,
            "record": node.extract(),
        }
        if direct_links:
            meta["direct_links"] = direct_links

        if not splash_links:
            response.meta.update(meta)
            return self.build_item(response)

        # Only the first splash page is visited.
        request = Request(splash_links[0], callback=self.scrape_for_abstract)
        request.meta.update(meta)
        return request

    def scrape_for_abstract(self, response):
        """Scrape splash page for abstracts.

        If a splash page link exists, ``parse_node`` returns a request that
        ends up here to scrape the abstract (and page number). Note that all
        the splash pages are different, hence one XPath per repository.
        Afterwards ``build_item`` is asked to build the HEPRecord.

        :param response: response of the splash page request; its ``meta``
            carries the data stored by ``parse_node``.
        :returns: the built item.
        """
        node = response.selector
        domain = parse_domain(response.url)
        page_nr = []
        abstract_raw = []

        # NOTE(review): every domain literal in this chain is an empty
        # string. ``"" in some_string`` is always true, so as written only
        # the first branch can ever run. The literals presumably held the
        # host names of the supported repositories -- restore them before
        # relying on the per-repository XPaths below.
        # To support another repository, add an ``elif`` branch with the
        # XPath of its abstract.
        if "" in domain or "" in domain:
            abstract_raw = node.xpath(
                "//span[@class='abstractFull']/pre/text()").extract()
            page_nr = node.xpath(
                "//tr[./th[contains(text(), 'Pagenumber')]]/td/text()"
            ).extract()
        elif "" in domain:
            abstract_raw = node.xpath(
                "//div[@id='abstract']//li/text()").extract()
        elif "" in domain:
            abstract_raw = node.xpath(
                "//div[@class='simple-item-view-abstract']/span/text()"
            ).extract()
        elif "" in domain:
            abstract_raw = node.xpath(
                ".//text()[contains(.,'Zusammenfassung')"
                "or contains(., 'Abstract')]/ancestor::*[self::tr]/descendant::*[position() > 1]/text()"
            ).extract()
        elif "" in domain:
            abstract_raw = node.xpath(
                "//div[@class='ep_summary_content_main']/h2/following-sibling::p/text()"
            ).extract()

        if abstract_raw:
            # Collapse the text fragments into one single-line abstract.
            response.meta["abstract"] = [
                " ".join(abstract_raw).replace("\r\n", " ")]
        response.meta["page_nr"] = page_nr

        return self.build_item(response)

    def build_item(self, response):
        """Build the final record.

        Re-parses the record XML stored in ``response.meta["record"]`` and
        fills a ``HEPRecord`` from it, together with the URLs, direct links,
        abstract and page number found in ``response.meta`` (missing keys
        yield ``None`` values).

        :param response: response whose ``meta`` holds the record data.
        :returns: the loaded item.
        """
        node = get_node(response.meta["record"], self.namespaces)
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        record.add_value('authors', self.get_authors(node))
        record.add_xpath(
            'title',
            "./slim:datafield[@tag='245']/slim:subfield[@code='a']/text()")
        record.add_xpath(
            'source',
            "./slim:datafield[@tag='264']/slim:subfield[@code='b']/text()")
        record.add_xpath(
            'date_published',
            "./slim:datafield[@tag='264']/slim:subfield[@code='c']/text()")
        record.add_value(
            'thesis_supervisor', self.get_thesis_supervisors(node))
        record.add_xpath(
            'language',
            "./slim:datafield[@tag='041']/slim:subfield[@code='a']/text()")
        record.add_value('urls', response.meta.get('urls'))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('abstract', response.meta.get("abstract"))
        record.add_value('page_nr', response.meta.get("page_nr"))
        record.add_value('thesis', {'degree_type': 'PhD'})
        record.add_value('collections', ['HEP', 'THESIS'])

        return record.load_item()