Package concurrent_tree_crawler :: Module cmdln_multithreaded_crawler
[hide private]
[frames | no frames]

Source Code for Module concurrent_tree_crawler.cmdln_multithreaded_crawler

  1  import logging 
  2  import datetime 
  3  import os.path 
  4  import argparse 
  5   
  6  from concurrent_tree_crawler.standard_node import StandardNode 
  7  from concurrent_tree_crawler.abstract_node import NodeState 
  8  from concurrent_tree_crawler.multithreaded_crawler import MultithreadedCrawler 
  9  from concurrent_tree_crawler.common.activity_schedule import \ 
 10          DaySchedule, AlwaysActiveSchedule 
class CmdLnMultithreadedCrawler:
    """
    A class that creates the L{MultithreadedCrawler} object based on
    command-line parameters
    """

    # Default value of the "--threads" command-line option.
    __default_threads_no = 2
    # Passed to L{MultithreadedCrawler} as its save-period argument
    # (NOTE(review): unit is not visible here -- confirm in the crawler).
    __save_period = 0.1

    def __init__(self, navigators_creator):
        """@type navigators_creator: L{AbstractCmdLnNavigatorsCreator}"""
        self.__navigators_creator = navigators_creator

    def __parse(self):
        """
        Build the argument parser, let the navigators creator add its own
        options to it, and parse the process command line.

        @return: the namespace produced by C{ArgumentParser.parse_args()}
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("state_file",
            help="Path to file where the state of the algorithm is saved.")
        parser.add_argument("--threads", type=int,
            default=self.__default_threads_no,
            help="number of crawler threads")
        # Each "-v" appends one element to C{args.verbose}; only the number
        # of elements matters (see C{__get_logging_level}).
        parser.add_argument("-v", "--verbose", action="append_const",
            const=None,
            help="If used once, shows warnings while running the program; "
                "if used twice, shows debug info while running the program.")
        parser.add_argument("--log_file",
            help="If this options is set, the logging information "
                "will be printed not only to standard output, but "
                "also to selected log file.")
        parser.add_argument("--daily_schedule",
            help='Daily start and stop times of the crawler program in form of '
                '"start_time-end_time" e.g. "12:30-16:45" or "12-12:30:55". '
                'If this option is not set, '
                'no schedule is used and the program works '
                'until it finishes its task.')
        self.__navigators_creator.fill_parser(parser)
        return parser.parse_args()

    @staticmethod
    def __get_schedule(schedule_string):
        """
        Create the activity schedule described by C{schedule_string}.

        @param schedule_string: value of the "--daily_schedule" option,
            or C{None} when the option was not given
        @return: L{DaySchedule} built from the parsed interval, or
            L{AlwaysActiveSchedule} when C{schedule_string} is C{None}
        """
        if schedule_string is None:
            return AlwaysActiveSchedule()
        start_time, end_time = _TimeParser.parse_time_interval(
            schedule_string)
        return DaySchedule(start_time, end_time)

    @staticmethod
    def __get_tree_summary(root, state_file_path, log_file_path):
        """
        Compose the human-readable summary printed at the end of the run.

        @param root: node whose C{get_state()} result decides the message
        @param state_file_path: path of the state file, shown as an
            absolute path in the error message
        @param log_file_path: path of the log file or C{None}; mentioned
            in the error message only when it is not C{None}
        @return: the summary text
        """
        msg = "Summary of the run:\n"
        state = root.get_state()
        if state == NodeState.ERROR:
            msg += "There were some problems while exploring the tree. "\
                "As a result, probably not all tree nodes were properly processed. "\
                "See the state file ({}) to check which nodes weren't properly "\
                "processed (these are the nodes with \"ERROR\" state). ".format(
                    os.path.abspath(state_file_path))
            if log_file_path is not None:
                msg += "You can also consult the log file ({}).".format(
                    os.path.abspath(log_file_path))
        elif state == NodeState.CLOSED:
            msg += "The whole tree has been explored; "\
                "all of the nodes were correctly processed."
        else:
            msg += "The tree has not been yet fully explored and processed."
        return msg

    @staticmethod
    def __get_logging_level(args):
        """
        Translate the number of "-v" flags into a C{logging} level.

        @param args: parsed command-line arguments with a C{verbose}
            attribute (a list with one element per "-v", or C{None})
        @return: C{logging.ERROR} for no flag, C{logging.WARNING} for one
            flag, C{logging.DEBUG} for two or more
        """
        # With action="append_const" and no explicit default, argparse
        # leaves the attribute as None when "-v" is never given, so
        # len() must not be applied to it directly.
        verbosity = len(args.verbose or ())
        if verbosity == 0:
            return logging.ERROR
        if verbosity == 1:
            return logging.WARNING
        return logging.DEBUG

    def run(self):
        """
        Parse the command line, build and run the L{MultithreadedCrawler},
        notify the navigators creator through C{on_exit()} and print
        a summary of the run.
        """
        args = self.__parse()
        threads_no = args.threads
        logging_level = self.__get_logging_level(args)
        log_file_path = args.log_file
        schedule = self.__get_schedule(args.daily_schedule)

        navigators = self.__navigators_creator.create(args, threads_no)

        sentinel = StandardNode()
        prog = MultithreadedCrawler(navigators, sentinel, schedule,
            log_file_path, args.state_file, self.__save_period, logging_level)
        # Single-argument parenthesised print gives the same output whether
        # print is a statement or a function.
        print("Starting activity with {} threads, "
            "activity daily schedule: {}".format(
                threads_no, args.daily_schedule))
        prog.run()
        root = sentinel.get_child("root")

        self.__navigators_creator.on_exit()

        print("Done.\n")
        print(self.__get_tree_summary(root, args.state_file, log_file_path))
109
class _TimeParser:
    """Helpers that turn time strings into C{datetime.time} objects."""

    @staticmethod
    def parse_time(string):
        """
        Parse a time string of format C{H}, C{H:M} or C{H:M:S},
        e.g. '12', '12:30' or '04:56:04'.

        @return: the parsed time, of type C{datetime.time}
        @raise Exception: if the string has more than three
            ":"-separated fields
        @raise ValueError: if a field is not an integer or is out of range
            for C{datetime.time}
        """
        elems = string.split(":")
        # str.split() always yields at least one element, so only the
        # upper bound needs checking.
        if len(elems) > 3:
            raise Exception("Not supported time format \"{}\"".format(string))
        # Fields map positionally onto hour, minute, second.
        return datetime.time(*[int(elem) for elem in elems])

    @staticmethod
    def parse_time_interval(string):
        """
        Parse time interval string of format e.g. '04:56:04-12:44'

        @return: (start, end), where both values are of type
            C{datetime.time}
        @raise Exception: if the string does not consist of exactly two
            "-"-separated times
        """
        parts = string.split("-")
        # Reject malformed intervals explicitly instead of failing with an
        # IndexError (no "-") or silently ignoring trailing parts.
        if len(parts) != 2:
            raise Exception(
                "Not supported time interval format \"{}\"".format(string))
        start_time = _TimeParser.parse_time(parts[0])
        end_time = _TimeParser.parse_time(parts[1])
        return (start_time, end_time)
134