# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2016 CERN.
#
# hepcrawl is free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
"""Spider for Brown University Digital Repository"""
from __future__ import absolute_import, print_function
import re
import json
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import split_fullname, parse_domain, get_mime_type


class BrownSpider(CrawlSpider):
    """Brown crawler.

    Scrapes theses metadata from the Brown Digital Repository JSON file:
    https://repository.library.brown.edu/api/collections/355/

    Browse the dissertations at:
    https://repository.library.brown.edu/studio/collections/id_355/

    1. parse() iterates through every record in the JSON file and yields
       a HEPRecord (or, if a splash-page link exists, a request to scrape
       it for the PDF file).

    Example usage:

    .. code-block:: console

        scrapy crawl brown -s "JSON_OUTPUT_DIR=tmp/"
        scrapy crawl brown -a source_file=file://`pwd`/tests/responses/brown/test_1.json -s "JSON_OUTPUT_DIR=tmp/"

    Happy crawling!
    """

    # TODO:
    # * Check how the API should be accessed. Right now the link points to
    #   a JSON file with the first 100 results of a query against the
    #   Physics dissertations collection.
    # * The splash page also links to the metadata in MODS XML format,
    #   which could be used as well.
name = 'brown'
start_urls = ["https://repository.library.brown.edu/api/collections/355/"]

    def __init__(self, source_file=None, *args, **kwargs):
"""Construct Brown spider."""
super(BrownSpider, self).__init__(*args, **kwargs)
self.source_file = source_file

    def start_requests(self):
        """Run the spider on a local test file if one is given, else on the start URLs."""
if self.source_file:
yield Request(self.source_file)
elif self.start_urls:
for url in self.start_urls:
yield Request(url)

    @staticmethod
    def _get_pdf_link(response):
        """Scrape the splash page for links to PDF files."""
pdf_links = []
all_links = response.xpath(
"//a[contains(@href, 'pdf') or contains(@href, 'PDF')]/@href").extract()
# Take only pdf-links, join relative urls with domain,
# and remove possible duplicates:
domain = parse_domain(response.url)
        all_links = sorted(set(
            urljoin(domain, link) for link in all_links if "?embed" not in link))
for link in all_links:
            # Keep only links that serve a PDF (get_mime_type also checks
            # the response headers):
try:
if "pdf" in get_mime_type(link) or "pdf" in link.lower():
pdf_links.append(urljoin(domain, link))
except (ValueError, IOError):
continue
return pdf_links

    @staticmethod
def _get_authors(response):
"""Get author data from the web page."""
authors = []
raw_authors = response.xpath(
"//div[@class='panel-body']/dl/dt[contains(text(), 'Contributors')]/following-sibling::dd[contains(text(), 'creator') or contains(text(), 'Creator')]/text()"
).extract()
if not raw_authors:
return authors
for auth in raw_authors:
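            # Strip the "(creator)" role marker so that only the name remains.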
auth = auth.replace("(creator)", "")
auth = auth.replace("(Creator)", "")
split_author = split_fullname(auth)
surname = split_author[0]
given_names = split_author[-1]
authors.append({
'surname': surname,
'given_names': given_names,
})
return authors

    @staticmethod
def _get_date(response):
"""Get copyright date from the web page."""
date_raw = response.xpath(
"//div[@class='panel-body']/dl/dt[contains(text(), 'Copyright')]/following-sibling::dd[1]/text()").extract_first()
# NOTE: apparently the only real data here is the year, all dates are
# of the format "01-01-2016, 01-01-2012" etc.
return date_raw

    @staticmethod
def _get_phd_year(response):
"""Parse notes and get the PhD year."""
phd_year = ""
notes_raw = response.xpath(
"//div[@class='panel-body']/dl/dt[contains(text(), 'Notes')]/following-sibling::dd[1]/text()").extract_first()
if notes_raw:
notes_raw = notes_raw.replace(".", "")
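            # Replace every non-word character with a space so the year
            # appears as a token of its own.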
pattern = re.compile(r'[\W_]+', re.UNICODE)
notes = pattern.sub(' ', notes_raw).split()
try:
                phd_year = [val for val in notes if val.isdigit()][0]
except IndexError:
pass
return phd_year

    def _get_thesis_info(self, response):
"""Create thesis info dictionary."""
return {
"date": self._get_phd_year(response),
"institutions": [{"name": "Brown University"}],
"degree_type": "PhD",
}

    @staticmethod
def _get_page_num(response):
"""Get number of pages from the web page."""
page_no_raw = response.xpath(
"//div[@class='panel-body']/dl/dt[contains(text(), 'Extent')]/following-sibling::dd[1]/text()").extract_first()
if page_no_raw:
page_no = [w for w in page_no_raw.split() if w.isdigit()]
return page_no

    def parse(self, response):
        """Go through every record in the JSON. If a link to a splash page
        exists, scrape it; otherwise build a record from the data already
        available.
        """
jsonresponse = json.loads(response.body_as_unicode())
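        # The collection API lists the individual records under items -> docs.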
for jsonrecord in jsonresponse["items"]["docs"]:
link = jsonrecord.get("uri")
try:
request = Request(link, callback=self.scrape_splash)
request.meta["jsonrecord"] = jsonrecord
pdf_link = link + "PDF/"
if "pdf" in get_mime_type(pdf_link):
request.meta["pdf_link"] = pdf_link
yield request
except (TypeError, ValueError, IOError):
response.meta["jsonrecord"] = jsonrecord
yield self.build_item(response)

    def scrape_splash(self, response):
        """Scrape the splash page for PDF links, author names, the copyright
        date, thesis info, and page numbers.
        """
if "pdf_link" not in response.meta:
response.meta["pdf_link"] = self._get_pdf_link(response)
response.meta["authors"] = self._get_authors(response)
response.meta["date"] = self._get_date(response)
response.meta["thesis"] = self._get_thesis_info(response)
response.meta["pages"] = self._get_page_num(response)
return self.build_item(response)

    def build_item(self, response):
        """Build the final record."""
jsonrecord = response.meta.get('jsonrecord')
record = HEPLoader(
item=HEPRecord(), selector=jsonrecord, response=response)
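        # Combine fields from the JSON record with whatever was scraped
        # from the splash page (stored on response.meta).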
record.add_value('title', jsonrecord.get('primary_title'))
record.add_value('abstract', jsonrecord.get('abstract'))
record.add_value('free_keywords', jsonrecord.get('keyword'))
record.add_value('page_nr', response.meta.get("pages"))
record.add_value('authors', response.meta.get("authors"))
record.add_value('file_urls', response.meta.get("pdf_link"))
record.add_value('urls', jsonrecord.get('uri'))
record.add_value('date_published', response.meta.get("date"))
record.add_value('thesis', response.meta.get("thesis"))
record.add_value('collections', ['HEP', 'THESIS'])
return record.load_item()