Source code for inspire_crawler.models

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2016 CERN.
#
# INSPIRE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Models for crawler integration."""

from __future__ import absolute_import, print_function

from datetime import datetime

from enum import Enum

from invenio_db import db

from sqlalchemy_utils.types import ChoiceType, UUIDType
from sqlalchemy.orm.exc import NoResultFound

from invenio_workflows.models import WorkflowObjectModel

from .errors import CrawlerJobNotExistError


class JobStatus(Enum):
    """Constants for possible status of any given PID."""

    __order__ = 'PENDING RUNNING FINISHED UNKNOWN'

    PENDING = 'pending'
    RUNNING = 'running'
    ERROR = 'error'
    FINISHED = 'finished'
    UNKNOWN = ''

    def __init__(self, value):
        """Hack."""

    def __eq__(self, other):
        """Equality test."""
        return self.value == other

    def __str__(self):
        """Return its value."""
        return self.value
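
# Illustrative note (not part of the original module): because of the
# overridden __eq__ and __str__ above, JobStatus members compare equal to,
# and render as, their plain string values, e.g.:
#
#     JobStatus.RUNNING == 'running'   # True
#     str(JobStatus.FINISHED)          # 'finished'
#
# This is convenient when comparing a stored status against plain strings.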


class CrawlerJob(db.Model):
    """Keeps track of submitted crawler jobs."""

    __tablename__ = 'crawler_job'

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    job_id = db.Column(UUIDType, index=True)
    spider = db.Column(db.String(255), index=True)
    workflow = db.Column(db.String(255), index=True)
    results = db.Column(db.Text, nullable=True)
    status = db.Column(
        ChoiceType(JobStatus, impl=db.String(10)),
        nullable=False,
    )
    logs = db.Column(db.Text, nullable=True)
    scheduled = db.Column(
        db.DateTime,
        default=datetime.now,
        nullable=False,
        index=True,
    )

    @classmethod
    def create(cls, job_id, spider, workflow,
               results=None, logs=None, status=JobStatus.PENDING):
        """Create a new entry for a scheduled crawler job."""
        obj = cls(
            job_id=job_id,
            spider=spider,
            workflow=workflow,
            results=results,
            logs=logs,
            status=status,
        )
        db.session.add(obj)
        # Return the new job so callers can commit the session and use it.
        return obj

    @classmethod
    def get_by_job(cls, job_id):
        """Get a row by job UUID."""
        try:
            return cls.query.filter_by(job_id=job_id).one()
        except NoResultFound:
            raise CrawlerJobNotExistError(job_id)

    def save(self):
        """Save object to persistent storage."""
        with db.session.begin_nested():
            db.session.add(self)


class CrawlerWorkflowObject(db.Model):
    """Relation between a job and workflow objects."""

    __tablename__ = "crawler_workflows_object"

    job_id = db.Column(UUIDType, primary_key=True)
    object_id = db.Column(
        db.Integer,
        db.ForeignKey(
            WorkflowObjectModel.id,
            ondelete="CASCADE",
            onupdate="CASCADE",
        ),
        primary_key=True,
    )


__all__ = (
    'CrawlerJob',
    'CrawlerWorkflowObject',
)
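

# Usage sketch (illustrative only, not part of the original module), assuming
# an Invenio application context with a configured database session. The
# spider and workflow names below are placeholders.
#
#     import uuid
#
#     from inspire_crawler.models import CrawlerJob, JobStatus
#     from invenio_db import db
#
#     job_uuid = uuid.uuid4()
#     CrawlerJob.create(
#         job_id=job_uuid,
#         spider='my_spider',       # placeholder spider name
#         workflow='my_workflow',   # placeholder workflow name
#     )
#     db.session.commit()
#
#     # Later, look the job up by its UUID and record that it finished.
#     job = CrawlerJob.get_by_job(job_uuid)
#     job.status = JobStatus.FINISHED
#     job.save()
#     db.session.commit()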