1 import logging
2 import datetime
3 import os.path
4 import argparse
5
6 from concurrent_tree_crawler.standard_node import StandardNode
7 from concurrent_tree_crawler.abstract_node import NodeState
8 from concurrent_tree_crawler.multithreaded_crawler import MultithreadedCrawler
9 from concurrent_tree_crawler.common.activity_schedule import \
10 DaySchedule, AlwaysActiveSchedule
13 """
14 A class that creates the L{MultithreadedCrawler} object based on
15 command-line parameters
16 """
17
# Default for the "--threads" command-line option when the user does not
# supply one.
18 __default_threads_no = 2
# Handed to the MultithreadedCrawler constructor right after the state-file
# path. NOTE(review): presumably the interval between state-file saves; the
# unit is not visible in this file -- confirm against MultithreadedCrawler.
19 __save_period = 0.1
20
22 """@type navigators_creator: L{AbstractCmdLnNavigatorsCreator}"""
# Keep the creator for later use: elsewhere in this file it is asked to
# register its own command-line options (fill_parser), to build the
# navigators (create) and is notified when the run is over (on_exit).
23 self.__navigators_creator = navigators_creator
24
# Build the command-line parser, let the navigators creator add its own
# options, parse the command line and return the resulting namespace.
26 parser = argparse.ArgumentParser()
# The only positional (and therefore mandatory) argument.
27 parser.add_argument("state_file",
28 help="Path to file where the state of the algorithm is saved.")
29 parser.add_argument("--threads", type=int,
30 default=self.__default_threads_no, help="number of crawler threads")
# Every "-v" appends one None to args.verbose, so the length of that list is
# the number of times the flag was given.
# NOTE(review): no default is supplied here; argparse then leaves
# args.verbose as None when "-v" is never passed, and the
# len(args.verbose) test used to pick the logging level raises TypeError.
# Consider default=[] (or action="count" with default=0).
31 parser.add_argument("-v", "--verbose", action="append_const",
32 const=None,
33 help="If used once, shows warnings while running the program; "
34 "if used twice, shows debug info while running the program.")
35 parser.add_argument("--log_file",
36 help="If this options is set, the logging information "
37 "will be printed not only to standard output, but "
38 "also to selected log file.")
# Kept as a raw string at this point; it is not parsed into times here.
39 parser.add_argument("--daily_schedule",
40 help='Daily start and stop times of the crawler program in form of '
41 '"start_time-end_time" e.g. "12:30-16:45" or "12-12:30:55". '
42 'If this option is not set, '
43 'no schedule is used and the program works '
44 'until it finishes its task.')
# Give the navigators creator a chance to register extra options before
# parsing.
45 self.__navigators_creator.fill_parser(parser)
# parse_args() without arguments reads the process's command line.
46 args = parser.parse_args()
47 return args
48
49 @staticmethod
57
# Build the human-readable end-of-run summary from the state of the tree's
# root node. Uses root, state_file_path and log_file_path; returns the
# message as a string.
58 @staticmethod
60 msg = "Summary of the run:\n"
# ERROR on the root: point the user at the state file and, when one was
# configured, at the log file as well.
61 if root.get_state() == NodeState.ERROR:
# The adjacent string literals are joined first, so .format() fills the
# single "{}" placeholder with the absolute state-file path.
62 msg += "There were some problems while exploring the tree. "\
63 "As a result, probably not all tree nodes were properly processed. "\
64 "See the state file ({}) to check which nodes weren't properly "\
65 "processed (these are the nodes with \"ERROR\" state). ".format(
66 os.path.abspath(state_file_path))
67 if log_file_path is not None:
68 msg += "You can also consult the log file ({}).".format(
69 os.path.abspath(log_file_path))
# CLOSED on the root: reported as a complete, error-free exploration.
70 elif root.get_state() == NodeState.CLOSED:
71 msg += "The whole tree has been explored; "\
72 "all of the nodes were correctly processed."
# Any other root state is reported as an unfinished exploration.
73 else:
74 msg += "The tree has not been yet fully explored and processed."
75 return msg
76
# Translate the number of "-v" flags into a logging level:
# none -> logging.ERROR, one -> logging.WARNING, two or more -> logging.DEBUG.
# NOTE(review): "--verbose" is declared with action="append_const" and no
# default, so args.verbose is None (not an empty list) when the flag is
# absent and the first len() call below raises TypeError -- confirm and
# give the option a default of [].
77 @staticmethod
79 if len(args.verbose) == 0:
80 return logging.ERROR
81 elif len(args.verbose) == 1:
82 return logging.WARNING
83 else:
84 return logging.DEBUG
85
86
# Parse the command line, assemble a MultithreadedCrawler from the options,
# run it, and print a summary of the outcome.
# NOTE(review): the print statements below are Python 2 syntax.
88 args = self.__parse()
89 threads_no = args.threads
90 logging_level = self.__get_logging_level(args)
91 log_file_path = args.log_file
# __get_schedule is not visible in this part of the file; it receives the
# raw "--daily_schedule" string (None when the option was not given).
92 schedule = self.__get_schedule(args.daily_schedule)
93
94 navigators = self.__navigators_creator.create(args, threads_no)
95
# The crawler is given a sentinel node; the actual tree root is looked up
# afterwards as the sentinel's child named "root".
96 sentinel = StandardNode()
97 prog = MultithreadedCrawler(navigators, sentinel, schedule,
98 log_file_path, args.state_file, self.__save_period, logging_level)
99 print "Starting activity with {} threads, "\
100 "activity daily schedule: {}".format(
101 threads_no, args.daily_schedule)
102 prog.run()
103 root = sentinel.get_child("root")
104
# Notify the navigators creator that the run is over before reporting.
105 self.__navigators_creator.on_exit()
106
107 print "Done.\n"
108 print self.__get_tree_summary(root, args.state_file, log_file_path)
109
# Parse a time of day written as "H", "H:M" or "H:M:S" into a datetime.time.
# Fields left out default to zero (datetime.time's own defaults).
# Raises a plain Exception when there are more than three ":"-separated
# fields; int() raises ValueError for an empty or non-numeric field, and
# datetime.time raises ValueError for out-of-range values.
111 @staticmethod
113 elems = string.split(":")
114 if len(elems) == 1:
115 return datetime.time(hour=int(elems[0]))
116 if len(elems) == 2:
117 return datetime.time(hour=int(elems[0]), minute=int(elems[1]))
118 if len(elems) == 3:
119 return datetime.time(hour=int(elems[0]),
120 minute=int(elems[1]), second=int(elems[2]))
# split() always yields at least one element, so this is reached only for
# four or more fields.
121 raise Exception("Not supported time format \"{}\"".format(string))
122
123 @staticmethod
125 """
126 Parse time interval string of format e.g. '04:56:04-12:44'
127
128 @return: (start, end), where both values are of type C{datetime.time}
129 """
# Split the interval on "-" and parse each side with _TimeParser.parse_time;
# the result is a (start_time, end_time) tuple of datetime.time objects.
# NOTE(review): only parts[0] and parts[1] are used -- a string without "-"
# raises IndexError, and anything after a second "-" is silently ignored.
130 parts = string.split("-")
131 start_time = _TimeParser.parse_time(parts[0])
132 end_time = _TimeParser.parse_time(parts[1])
133 return (start_time, end_time)
134