Source code for hepcrawl.spiders.phil_spider

# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for Philpapers.org"""

from __future__ import absolute_import, print_function

import json
from urlparse import urljoin

from scrapy import Request
from scrapy.spiders import CrawlSpider

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import parse_domain, get_mime_type


class PhilSpider(CrawlSpider):

    """Phil crawler

    Scrapes theses metadata from the Philpapers.org JSON file.

    1. parse() iterates through every record in the JSON file and yields
       a HEPRecord (or a request to scrape for the pdf file if a link exists).

    Example usage:

    .. code-block:: console

        scrapy crawl phil -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl phil -a source_file=file://`pwd`/tests/responses/phil/test_thesis.json -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    # TODO: Have to check if new records are appended to the file or if the
    # file is just replaced with new information. Actually some old records
    # are removed while new ones are added?

    name = 'phil'
    start_urls = ["http://philpapers.org/philpapers/raw/export/inspire.json"]

    def __init__(self, source_file=None, *args, **kwargs):
        """Construct Phil spider."""
        super(PhilSpider, self).__init__(*args, **kwargs)
        self.source_file = source_file
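    # Illustrative only (not part of the original module): the keys below are
    # the ones parse() and build_item() read from each record of the
    # Philpapers JSON export; the values are invented for illustration.
    #
    #     {
    #         "title": "Example thesis title",
    #         "abstract": "Example abstract.",
    #         "doi": "10.0000/example",
    #         "pages": "123",
    #         "authors": ["Doe, Jane"],
    #         "links": ["http://example.org/archive/EXAMPLE.pdf"],
    #         "year": "12/05/2010",
    #         "pub_type": "thesis",
    #         "pubInfo": "Dissertation, Example University",
    #         "journal": "",
    #         "volume": "0",
    #         "issue": "0"
    #     }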
    def start_requests(self):
        """You can also run the spider on local test files."""
        if self.source_file:
            yield Request(self.source_file)
        elif self.start_urls:
            for url in self.start_urls:
                yield Request(url)
    def get_authors(self, author_element):
        """Parse the author data and return a list of raw names."""
        authors = []
        for auth in author_element:
            authors.append({'raw_name': auth})
        return authors
    def get_date(self, record):
        """Return a standard format date.

        YYYY-MM-DD, YYYY-MM or YYYY.
        """
        date_raw = record['year'].split("/")
        if len(date_raw) == 1:
            date_published = date_raw[0]
        elif len(date_raw) == 2:
            date_published = date_raw[-1] + "-" + date_raw[0]
        elif len(date_raw) == 3:
            date_published = date_raw[-1] + "-" + date_raw[1] + "-" + date_raw[0]
        else:
            # Fall back to the raw value if the date has an unexpected shape.
            date_published = record['year']
        return date_published
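    # For illustration only (not in the original source): given the splitting
    # logic above, a raw value of "12/05/2010" is assumed to mean DD/MM/YYYY:
    #
    #     spider.get_date({'year': '12/05/2010'})  # -> '2010-05-12'
    #     spider.get_date({'year': '05/2010'})     # -> '2010-05'
    #     spider.get_date({'year': '2010'})        # -> '2010'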
    def parse(self, response):
        """Parse Philpapers JSON file into a HEP record."""
        jsonresponse = json.loads(response.body_as_unicode())
        for jsonrecord in jsonresponse:
            urls_in_record = jsonrecord.get("links")
            if urls_in_record:
                link = urls_in_record[0]
                request = Request(link, callback=self.scrape_for_pdf)
                request.meta["urls"] = urls_in_record
                request.meta["jsonrecord"] = jsonrecord
                yield request
            else:
                response.meta["urls"] = []
                response.meta["jsonrecord"] = jsonrecord
                yield self.build_item(response)
    def scrape_for_pdf(self, response):
        """Scrape splash page for any links to PDFs.

        If no direct link existed, parse() yields a request here to scrape
        the urls. This will find a direct pdf link on the splash page, if
        one exists, and then ask build_item to build the HEPRecord.
        """
        pdf_links = []
        all_links = response.xpath(
            "//a[contains(@href, 'pdf')]/@href").extract()
        # Take only pdf-links, join relative urls with domain,
        # and remove possible duplicates:
        domain = parse_domain(response.url)
        all_links = sorted(list(set(
            [urljoin(domain, link) for link in all_links
             if "jpg" not in link.lower()])))
        for link in all_links:
            # Extract only links with pdf in them (checks also headers):
            pdf = "pdf" in get_mime_type(link) or "pdf" in link.lower()
            if pdf and "jpg" not in link.lower():
                pdf_links.append(urljoin(domain, link))

        response.meta["direct_links"] = pdf_links
        response.meta["urls"] = response.meta.get('urls')
        response.meta["jsonrecord"] = response.meta.get('jsonrecord')
        return self.build_item(response)
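    # Illustration (not in the original source): if the splash page links to
    # both "paper.pdf" and "cover.jpg", only the absolute pdf link survives
    # the filtering above and ends up in response.meta["direct_links"].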
    def build_item(self, response):
        """Build the final record."""
        jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        record.add_value('title', jsonrecord['title'])
        record.add_value('abstract', jsonrecord['abstract'])
        record.add_value('dois', jsonrecord['doi'])
        record.add_value('page_nr', jsonrecord['pages'])
        record.add_value('authors', self.get_authors(jsonrecord['authors']))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('urls', jsonrecord['links'])
        record.add_value('source', "Philpapers.org")
        if not jsonrecord.get('year') == "forthcoming":
            record.add_value('date_published', self.get_date(jsonrecord))

        type_thesis = "thesis" in jsonrecord.get('pub_type').lower()
        info_diss = "dissertation" in jsonrecord.get('pubInfo').lower()
        if type_thesis or info_diss:
            record.add_value('collections', ['THESIS'])
        elif "journal" in jsonrecord.get('pub_type').lower():
            record.add_value('journal_title', jsonrecord['journal'])
            if not jsonrecord.get('volume') == "0":
                record.add_value('journal_volume', jsonrecord['volume'])
            if not jsonrecord.get('issue') == "0":
                record.add_value('journal_issue', jsonrecord['issue'])
            if not jsonrecord.get('year') == "forthcoming":
                record.add_value('journal_year', int(jsonrecord['year']))

        return record.load_item()
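
# A minimal sketch, not part of the original spider: running the crawl
# programmatically with Scrapy's CrawlerProcess. The settings value and the
# local source_file path are illustrative assumptions mirroring the class
# docstring.
if __name__ == "__main__":
    import os

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({"JSON_OUTPUT_DIR": "tmp/"})
    process.crawl(
        PhilSpider,
        source_file="file://" + os.path.abspath(
            "tests/responses/phil/test_thesis.json"),
    )
    process.start()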