Source code for hepcrawl.spiders.phil_spider

# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for Philpapers.org"""

from __future__ import absolute_import, print_function

import json
from urlparse import urljoin

from scrapy import Request
from scrapy.spiders import CrawlSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import parse_domain, get_mime_type


class PhilSpider(CrawlSpider):

    """Phil crawler

    Scrapes theses metadata from the Philpapers.org JSON file.

    1. parse() iterates through every record in the JSON file and yields
       a HEPRecord (or a request to scrape for the pdf file if a link exists).

    Example usage:

    .. code-block:: console

        scrapy crawl phil -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl phil -a source_file=file://`pwd`/tests/responses/phil/test_thesis.json -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    # TODO: Have to check if new records are appended to the file or if the
    # file is just replaced with new information. Actually some old records
    # are removed while new ones are added?

    name = 'phil'
    start_urls = ["http://philpapers.org/philpapers/raw/export/inspire.json"]

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct Phil spider."""
        super(PhilSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file
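    # Illustrative only (not part of the original module): the keys below are
    # the ones parse() and build_item() read from each record of the
    # Philpapers JSON export; the values are invented for illustration.
    #
    #     {
    #         "title": "Example thesis title",
    #         "abstract": "Example abstract.",
    #         "doi": "10.0000/example",
    #         "pages": "123",
    #         "authors": ["Doe, Jane"],
    #         "links": ["http://example.org/archive/EXAMPLE.pdf"],
    #         "year": "12/05/2010",
    #         "pub_type": "thesis",
    #         "pubInfo": "Dissertation, Example University",
    #         "journal": "",
    #         "volume": "0",
    #         "issue": "0"
    #     }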
    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            for url in self.start_urls:
                yield Request(url)
    def get_authors(self, author_element):
        """Parse the author data and return a list of raw names."""
        authors = []
        for auth in author_element:
            authors.append({'raw_name': auth})
        return authors
    def get_date(self, record):
        """Return a standard format date.

        YYYY-MM-DD, YYYY-MM or YYYY.
        """
        date_raw = record['year'].split("/")
        if len(date_raw) == 1:
            date_published = date_raw[0]
        elif len(date_raw) == 2:
            date_published = date_raw[-1] + "-" + date_raw[0]
        elif len(date_raw) == 3:
            date_published = date_raw[-1] + "-" + date_raw[1] + "-" + date_raw[0]
        else:
            # Fall back to the raw value if the date has an unexpected shape.
            date_published = record['year']
        return date_published
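    # For illustration only (not in the original source): given the splitting
    # logic above, a raw value of "12/05/2010" is assumed to mean DD/MM/YYYY:
    #
    #     spider.get_date({'year': '12/05/2010'})  # -> '2010-05-12'
    #     spider.get_date({'year': '05/2010'})     # -> '2010-05'
    #     spider.get_date({'year': '2010'})        # -> '2010'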
    def parse(self, response):
        """Parse Philpapers JSON file into a HEP record."""
        jsonresponse = json.loads(response.body_as_unicode())
        for jsonrecord in jsonresponse:
            urls_in_record = jsonrecord.get("links")
            if urls_in_record:
                link = urls_in_record[0]
                request = Request(link, callback=self.scrape_for_pdf)
                request.meta["urls"] = urls_in_record
                request.meta["jsonrecord"] = jsonrecord
                yield request
            else:
                response.meta["urls"] = []
                response.meta["jsonrecord"] = jsonrecord
                yield self.build_item(response)
    def scrape_for_pdf(self, response):
        """Scrape splash page for any links to PDFs.

        If no direct link existed, parse() yields a request here to scrape
        the urls. This will find a direct pdf link on the splash page, if
        one exists, and then ask build_item to build the HEPRecord.
        """
        pdf_links = []
        all_links = response.xpath(
            "//a[contains(@href, 'pdf')]/@href").extract()
        # Take only pdf-links, join relative urls with domain,
        # and remove possible duplicates:
        domain = parse_domain(response.url)
        all_links = sorted(list(set(
            [urljoin(domain, link) for link in all_links
             if "jpg" not in link.lower()])))
        for link in all_links:
            # Extract only links with pdf in them (checks also headers):
            pdf = "pdf" in get_mime_type(link) or "pdf" in link.lower()
            if pdf and "jpg" not in link.lower():
                pdf_links.append(urljoin(domain, link))

        response.meta["direct_links"] = pdf_links
        response.meta["urls"] = response.meta.get('urls')
        response.meta["jsonrecord"] = response.meta.get('jsonrecord')
        return self.build_item(response)
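    # Illustration (not in the original source): if the splash page links to
    # both "paper.pdf" and "cover.jpg", only the absolute pdf link survives
    # the filtering above and ends up in response.meta["direct_links"].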
    def build_item(self, response):
        """Build the final record."""
        jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        record.add_value('title', jsonrecord['title'])
        record.add_value('abstract', jsonrecord['abstract'])
        record.add_value('dois', jsonrecord['doi'])
        record.add_value('page_nr', jsonrecord['pages'])
        record.add_value('authors', self.get_authors(jsonrecord['authors']))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('urls', jsonrecord['links'])
        record.add_value('source', "Philpapers.org")
        if not jsonrecord.get('year') == "forthcoming":
            record.add_value('date_published', self.get_date(jsonrecord))

        type_thesis = "thesis" in jsonrecord.get('pub_type').lower()
        info_diss = "dissertation" in jsonrecord.get('pubInfo').lower()
        if type_thesis or info_diss:
            record.add_value('collections', ['THESIS'])
        elif "journal" in jsonrecord.get('pub_type').lower():
            record.add_value('journal_title', jsonrecord['journal'])
            if not jsonrecord.get('volume') == "0":
                record.add_value('journal_volume', jsonrecord['volume'])
            if not jsonrecord.get('issue') == "0":
                record.add_value('journal_issue', jsonrecord['issue'])
            if not jsonrecord.get('year') == "forthcoming":
                record.add_value('journal_year', int(jsonrecord['year']))

        return record.load_item()
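
# A minimal sketch, not part of the original spider: running the crawl
# programmatically with Scrapy's CrawlerProcess. The settings value and the
# local source_file path are illustrative assumptions mirroring the class
# docstring.
if __name__ == "__main__":
    import os

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({"JSON_OUTPUT_DIR": "tmp/"})
    process.crawl(
        PhilSpider,
        source_file="file://" + os.path.abspath(
            "tests/responses/phil/test_thesis.json"),
    )
    process.start()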