# Source code for auromat.util.url

# Copyright European Space Agency, 2013

from __future__ import absolute_import, print_function

from six import reraise
from six.moves.urllib.error import HTTPError
from six.moves.urllib.request import urlopen
import shutil
import os
import json
import sys

# monkey-patch HTTPError and add a __repr__ method
# so that it doesn't display as 'HTTPError()' but like __str__
# as e.g. 'HTTP Error 500: msg'
HTTPError.__repr__ = HTTPError.__str__
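
# The patch mainly matters when HTTPError instances end up inside
# containers, e.g. the failures list returned by downloadFiles below:
# printing such a list calls repr() on each element, which without the
# patch would show only the uninformative 'HTTPError()'.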

def urlResponseCode(url):
    """
    Return the response code of the server without downloading
    the actual data.
    """
    try:
        code = _urlResponseCode(url)
    except:
        # try again once in case of network problems
        code = _urlResponseCode(url)
    return code

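# Example usage (a sketch; the URL below is hypothetical):
#
#   if urlResponseCode('http://example.com/status') == 200:
#       print('server reachable')
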
def _urlResponseCode(url):
    try:
        connection = urlopen(url)
        code = connection.getcode()
        connection.close()
    except HTTPError as e:
        code = e.getcode()
    return code

class DownloadError(Exception):
    pass

def downloadFile(url, path, unifyErrors=True):
    """
    Download a single resource and store it as a file on disk.
    On download errors (except 404) the download is retried once;
    after that, an exception is raised.
    """
    def saveToDisk(req):
        tmpPath = path + '.tmp'
        with open(tmpPath, 'wb') as fp:
            shutil.copyfileobj(req, fp)
        os.rename(tmpPath, path)
    downloadResource(url, saveToDisk, unifyErrors=unifyErrors)

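# Example usage (a sketch; URL and path are hypothetical):
#
#   downloadFile('http://example.com/image.jpg', '/tmp/image.jpg')
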
def downloadJSON(url, unifyErrors=True, data=None, **kw):
    """
    Parse and return the JSON document at the given URL.
    Any additional keywords are passed unchanged to json.load.
    """
    def asjson(req):
        return json.load(req, **kw)
    return downloadResource(url, asjson, data=data, unifyErrors=unifyErrors)

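# Example usage (a sketch; the URL and key are hypothetical):
#
#   meta = downloadJSON('http://example.com/api/meta.json')
#   print(meta['version'])
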
def downloadResource(url, fn, data=None, unifyErrors=True):
    """
    Download a single resource and call `fn` on it.
    On download errors (except 404) the download is retried once;
    after that, an exception is raised.
    """
    retry = False
    try:
        return _downloadResource(url, fn, data=data)
    except HTTPError as e:
        if e.code == 404:
            if unifyErrors:
                raise DownloadError(e)
            else:
                raise
        else:
            retry = True
    except IOError as e:
        if unifyErrors:
            raise DownloadError(e)
        else:
            raise
    except:
        retry = True
    if retry:
        print('download error, retrying once')
        try:
            return _downloadResource(url, fn, data=data)
        except:
            if unifyErrors:
                _, e, tb = sys.exc_info()
                new_exc = DownloadError('{}: {}'.format(e.__class__.__name__, e))
                reraise(new_exc.__class__, new_exc, tb)
            else:
                raise

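# Example usage (a sketch; the URL is hypothetical): read the raw
# bytes of a resource by passing a custom handler function:
#
#   content = downloadResource('http://example.com/data.bin',
#                              lambda req: req.read())
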
def _downloadResource(url, fn, data=None):
    print('downloading', url, end='')
    try:
        req = urlopen(url, data=data)  # throws also on 404
        res = fn(req)
        print('-> done')
        return res
    except Exception as e:
        # 404, network problem, IO error, ...
        print('->', str(e))
        raise

def downloadFiles(urls, paths, retFailures=False):
    """
    Download multiple resources and store them on disk at the given
    `paths`, skipping files that already exist on disk.
    On download errors (except 404), each download is retried once.

    If retFailures is False, then True is returned if all files could
    be downloaded successfully, otherwise False.
    If retFailures is True, then a tuple (bool, failures) is returned
    which additionally contains the URLs (with exceptions) that
    couldn't be downloaded.
    """
    failures = []
    for url, path in zip(urls, paths):
        if os.path.exists(path):
            continue
        try:
            downloadFile(url, path, unifyErrors=False)
        except Exception as e:
            failures.append((url, e))
    if retFailures:
        return len(failures) == 0, failures
    else:
        return len(failures) == 0
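
# Example usage (a sketch; URLs and paths are hypothetical):
#
#   urls = ['http://example.com/a.jpg', 'http://example.com/b.jpg']
#   paths = ['/tmp/a.jpg', '/tmp/b.jpg']
#   ok, failures = downloadFiles(urls, paths, retFailures=True)
#   for url, e in failures:
#       print('failed:', url, e)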