1 import os
2 import logging
3 import time
4 import datetime
5
6 from concurrent_tree_crawler.common.file_helper import lenient_makedir
7 from concurrent_tree_crawler.common.logger import Logger
8 from concurrent_tree_crawler.common.activity_schedule import AlwaysActiveSchedule
9 from concurrent_tree_crawler.crawlers_manager import CrawlersManager
10 from concurrent_tree_crawler.rw_lock_tree_accessor import RWLockTreeAccessor
11 from concurrent_tree_crawler.navigator_tree_wrapper import NavigatorTreeWrapper
12 from concurrent_tree_crawler.tree_saver_thread import TreeSaverThread
13 from concurrent_tree_crawler.abstract_node import NodeState
14 from concurrent_tree_crawler.xml_tree_serialization import XMLTreeReader
17 """
18 Runs several threads to crawl the tree.
19
It is also responsible for the ancillary tasks:
it makes sure that the state of the tree is saved to disk,
sets up the logging level, etc.
23 """
24
25 - def __init__(self, navigators, sentinel, activity_schedule=None,
26 log_file_path=None, state_file_path=None, save_period=None,
27 logging_level=logging.ERROR):
28 """
29 @param navigators: list of navigators to be used by the crawler.
30 Each navigator will be run in a separate thread, thus the
31 number of the threads is equal to the number of navigators.
32 @type navigators: list of L{AbstractTreeNavigator}s
33 @param sentinel: a technical node which will be made parent of the
34 root node.
35 @type sentinel: L{AbstractNode}
36 @param activity_schedule: if C{None}, no schedule is used and the
37 program works until it finishes crawling.
38 @type activity_schedule: L{AbstractActivitySchedule}
39 @param log_file_path: path to the log file. If C{None}, no log file
40 will be used.
41 @param state_file_path: path to the file where the state of the
42 program will be saved. If C{None}, the state will not be saved.
43 @param save_period: time between saving the tree state. If
44 C{state_file_path} is C{None}, this value is ignored.
45 @param logging_level: one of the logging level constants from C{logging}
46 """
47 if log_file_path is not None:
48 lenient_makedir(os.path.dirname(log_file_path))
49 if state_file_path is not None:
50 if os.path.exists(state_file_path):
51 print "State file already exists. Loading the tree from this "\
52 "file and changing nodes with state PROCESSING to OPEN ... ",
53 self.__load_state_file(state_file_path, sentinel)
54 print "Done."
55 else:
56 lenient_makedir(os.path.dirname(state_file_path))
57 self.__tree = RWLockTreeAccessor(sentinel)
58 self.__navigators = navigators
59 self.__manager = None
60 self.__state_file_path = state_file_path
61 self.__save_period = save_period
62 self.__activity_schedule = activity_schedule
63 if activity_schedule is None:
64 self.__activity_schedule = AlwaysActiveSchedule()
65 self.__logging_level = logging_level
66 self.__log_file_path = log_file_path
67
69 """
70 @return: sentinel node
71 @rtype: L{AbstractNode}
72 """
73 self.__manager = self._create_crawlers_manager(
74 self.__tree, self.__navigators)
75 if self.__log_file_path is not None:
76 Logger.start(file_path=self.__log_file_path,
77 logging_level=self.__logging_level)
78 while True:
79 activity_time = self.__sleep_until_activity_period()
80 saver_thread = None
81 if self.__state_file_path is not None:
82 saver_thread = self.__start_tree_saver_thread()
83 self.__manager.start()
84 threads_finished = \
85 self.__manager.wait_until_finish(timeout=activity_time)
86 if self.__state_file_path is not None:
87 saver_thread.stop_activity()
88 saver_thread.join()
89 if threads_finished:
90 break
91 if self.__log_file_path is not None:
92 Logger.stop()
93 return self.__tree.get_sentinel()
94
96 navigator_wrappers = []
97 for navigator in navigators:
98 navigator_wrapper = NavigatorTreeWrapper(navigator, tree)
99 navigator_wrappers.append(navigator_wrapper)
100 return CrawlersManager(tree, navigator_wrappers)
101
108
110 """
Sleep (stop program execution) until it is time to wake up.
112
@return: activity time, i.e. the time remaining until the start of
the next sleep period, or C{None} if no such time point can be
determined (as is the case when the activity period never ends).
116 @rtype: number of seconds
117 """
118 while True:
119 now = datetime.datetime.now()
120 info = self.__activity_schedule.get_activity_info(now)
121 if info.future_mode_change is None:
122 if info.is_in_activity_period:
123 return None
124 else:
125 raise Exception("Going to sleep forever?")
126 mode_change_time = (info.future_mode_change - now).total_seconds()
127 if not info.is_in_activity_period:
128 logging.info("Going to sleep for {:.1f} seconds "
129 "(according to schedule)".format(
130 mode_change_time))
131 time.sleep(mode_change_time)
132 logging.info("Awaken")
133 else:
134 logging.info("Starting activity for {:.1f} seconds "
135 "(according to schedule)".format(
136 mode_change_time))
137 return mode_change_time
138
139 @staticmethod
146
147 @staticmethod
153