Source code for pychemia.evaluator.direct_evaluator

from __future__ import print_function
import os
import socket
import time
from multiprocessing import Process
from pychemia import pcm_log, HAS_PYMONGO

if HAS_PYMONGO:
    from pychemia.db import get_database


class DirectEvaluator:

    def __init__(self, db_settings, dbnames, source_dir, is_evaluated, worker, worker_args=None,
                 nconcurrent=1, evaluate_failed=False, evaluate_all=False, sleeping_time=120):
        """
        DirectEvaluator is a class to manage the execution of a function 'worker' for entries on a list of
        PyChemiaDB databases.
        The execution of each worker occurs directly on the machine executing the code, and the class controls
        the number of concurrent executions using the variable 'nconcurrent'.

        :param db_settings: Common database settings for all the databases that will be processed by this
                            evaluator. All databases should share common settings such as server name, user and
                            password, SSL and replica-set settings.
        :param dbnames: List of database names; the common settings are stored in db_settings.
        :param source_dir: Path to the directory where the candidates will be evaluated.
        :param is_evaluated: Function that decides whether an entry is already evaluated; candidates for which
                             it returns False are passed to the worker.
        :param worker: Function applied to each candidate in the databases that is not considered evaluated.
        :param worker_args: Arguments for the worker function (usually a dictionary that depends on the
                            arguments needed by worker).
        :param nconcurrent: Number of concurrent executions allowed. This should usually be the number of
                            cores available on the machine. (default: 1)
        :param evaluate_failed: Boolean to decide if candidates marked as failed should be evaluated again.
        :param evaluate_all: Boolean to decide if all candidates are evaluated regardless of the outcome of the
                             function is_evaluated. (default: False)
        :param sleeping_time: Time in seconds to wait before searching again for new candidates to evaluate.
                              (default: 120 seconds)
        """
        self.db_settings = db_settings
        self.dbnames = dbnames
        self.source_dir = source_dir
        self.is_evaluated = is_evaluated
        self.worker = worker
        self.worker_args = worker_args
        self.nconcurrent = nconcurrent
        self.evaluate_failed = evaluate_failed
        self.evaluate_all = evaluate_all
        self.sleeping_time = sleeping_time
    def unlock_all(self):
        """
        Checking all databases and unlocking all entries.

        :return: None
        """
        for idb in self.dbnames:
            db_settings = dict(self.db_settings)
            db_settings['name'] = idb
            pcdb = get_database(db_settings)
            print('Database contains: %d entries' % pcdb.entries.count())
            for entry in pcdb.entries.find({}):
                pcdb.unlock(entry['_id'], name=socket.gethostname())
            print('Number of entries: ', pcdb.entries.count())
    def get_list_candidates(self):
        """
        Scan all databases looking for candidates for evaluation.

        :return: A list of pairs; each pair contains the name of the database and the candidate identifier.
        """
        ret = []
        for idb in self.dbnames:
            print(idb)
            db_settings = dict(self.db_settings)
            db_settings['name'] = idb
            pcdb = get_database(db_settings)
            for entry in pcdb.entries.find({}, {'_id': 1}):
                entry_id = entry['_id']
                if not self.is_evaluated(pcdb, entry_id, self.worker_args) or self.evaluate_all:
                    pcm_log.debug('Adding entry %s from db %s' % (str(entry_id), pcdb.name))
                    ret.append([idb, entry_id])
        print('Found %d entries to evaluate' % len(ret))
        return ret
    def run(self):
        """
        Continuously search for suitable candidates for evaluation among a list of databases.

        :return:
        """
        procs = []
        ids_running = []
        # Create a list to store the 'nconcurrent' running jobs
        for i in range(self.nconcurrent):
            procs.append(None)
            ids_running.append(None)

        self.unlock_all()

        # Main loop, permanently looking for candidates for evaluation
        while True:
            to_evaluate = self.get_list_candidates()

            index = 0
            currently_evaluating = 0
            for j in range(self.nconcurrent):
                if procs[j] is not None and procs[j].is_alive():
                    currently_evaluating += 1
            print('Candidates to evaluate: %d  Candidates in evaluation: %d' %
                  (len(to_evaluate), currently_evaluating))

            while index < len(to_evaluate):

                db_settings = dict(self.db_settings)
                # The first component of each pair in to_evaluate is the name of the database
                dbname = to_evaluate[index][0]
                db_settings['name'] = dbname
                pcdb = get_database(db_settings)
                # The second component of each pair in to_evaluate is the entry_id
                entry_id = to_evaluate[index][1]

                # Release the slots whose processes have finished
                for j in range(self.nconcurrent):
                    if procs[j] is None or not procs[j].is_alive():
                        ids_running[j] = None

                if entry_id in ids_running:
                    print('Already executing: %s' % entry_id)
                    index += 1
                    continue
                else:
                    print('DB: %10s Entry: %s' % (dbname, entry_id))

                if not os.path.exists(self.source_dir + os.sep + dbname):
                    os.mkdir(self.source_dir + os.sep + dbname)

                # Wait until a slot becomes available for a new process
                slot = None
                while True:
                    for j in range(self.nconcurrent):
                        if procs[j] is None or not procs[j].is_alive():
                            slot = j
                            break
                    if slot is None:
                        time.sleep(self.sleeping_time)
                    else:
                        break

                # The function is_evaluated takes three arguments, the database object, the entry identifier
                # and the worker arguments, and must return a boolean to decide if the candidate should be
                # evaluated.
                if not self.is_evaluated(pcdb, entry_id, self.worker_args) or self.evaluate_all:
                    pcm_log.debug('Evaluable: %s:%s. Relaxing entry %d of %d Slot: %d' %
                                  (dbname, str(entry_id), index, len(to_evaluate), slot))
                    ids_running[slot] = entry_id
                    workdir = self.source_dir + os.sep + dbname + os.sep + str(entry_id)
                    if not os.path.exists(workdir):
                        os.mkdir(workdir)
                    pcm_log.debug('Launching for %s id: %s' % (pcdb.name, str(entry_id)))

                    # This is the actual call to the worker; it must be a function with 4 arguments:
                    # the database settings, the entry identifier, the working directory and the arguments
                    # for the worker.
                    procs[slot] = Process(target=self.worker,
                                          args=(db_settings, entry_id, workdir, self.worker_args))
                    procs[slot].start()
                    time.sleep(1)
                else:
                    pcm_log.debug('Not evaluable: %s' % str(entry_id))

                index += 1
            time.sleep(self.sleeping_time)
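
The 'is_evaluated' and 'worker' callables are supplied by the caller and must match the calling conventions
used in run(): is_evaluated(pcdb, entry_id, worker_args) and worker(db_settings, entry_id, workdir,
worker_args). Below is a minimal usage sketch, not part of the module: the database names, the source
directory, the connection settings and the 'energy' property field are illustrative placeholders, and it
assumes pcdb.entries behaves like a pymongo collection (as suggested by the find/count calls above).

    from pychemia.db import get_database
    from pychemia.evaluator.direct_evaluator import DirectEvaluator


    def is_evaluated(pcdb, entry_id, worker_args):
        # Hypothetical criterion: an entry counts as evaluated once it carries an 'energy' property
        entry = pcdb.entries.find_one({'_id': entry_id}, {'properties': 1})
        return entry is not None and 'energy' in (entry.get('properties') or {})


    def worker(db_settings, entry_id, workdir, worker_args):
        # Runs in its own process, so it opens a fresh database connection from the settings dict
        pcdb = get_database(db_settings)
        entry = pcdb.entries.find_one({'_id': entry_id})
        # ... run the actual calculation inside 'workdir' and store its results back into the entry ...


    if __name__ == '__main__':
        # Placeholder connection settings; adjust to your MongoDB server
        db_settings = {'host': 'localhost', 'port': 27017}
        evaluator = DirectEvaluator(db_settings, dbnames=['mydb1', 'mydb2'], source_dir='/tmp/runs',
                                    is_evaluated=is_evaluated, worker=worker, worker_args={},
                                    nconcurrent=4, sleeping_time=60)
        evaluator.run()  # blocks, launching up to 'nconcurrent' workers at a time

Note that run() hands the worker the plain db_settings dictionary rather than a live database object, since
each worker executes in a separate multiprocessing.Process and must open its own connection.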