Package concurrent_tree_crawler :: Module multithreaded_crawler
[hide private]
[frames | no frames]

Source Code for Module concurrent_tree_crawler.multithreaded_crawler

  1  import os 
  2  import logging 
  3  import time 
  4  import datetime 
  5   
  6  from concurrent_tree_crawler.common.file_helper import lenient_makedir 
  7  from concurrent_tree_crawler.common.logger import Logger 
  8  from concurrent_tree_crawler.common.activity_schedule import AlwaysActiveSchedule 
  9  from concurrent_tree_crawler.crawlers_manager import CrawlersManager 
 10  from concurrent_tree_crawler.rw_lock_tree_accessor import RWLockTreeAccessor 
 11  from concurrent_tree_crawler.navigator_tree_wrapper import NavigatorTreeWrapper 
 12  from concurrent_tree_crawler.tree_saver_thread import TreeSaverThread 
 13  from concurrent_tree_crawler.abstract_node import NodeState 
 14  from concurrent_tree_crawler.xml_tree_serialization import XMLTreeReader 
class MultithreadedCrawler:
    """
    Runs several threads to crawl the tree.

    It is also responsible for all the ancillary stuff:
    makes sure that the state of the tree is saved to disk,
    sets up the logging level etc.
    """

    def __init__(self, navigators, sentinel, activity_schedule=None,
            log_file_path=None, state_file_path=None, save_period=None,
            logging_level=logging.ERROR):
        """
        @param navigators: list of navigators to be used by the crawler.
            Each navigator will be run in a separate thread, thus the
            number of the threads is equal to the number of navigators.
        @type navigators: list of L{AbstractTreeNavigator}s
        @param sentinel: a technical node which will be made parent of the
            root node.
        @type sentinel: L{AbstractNode}
        @param activity_schedule: if C{None}, no schedule is used and the
            program works until it finishes crawling.
        @type activity_schedule: L{AbstractActivitySchedule}
        @param log_file_path: path to the log file. If C{None}, no log file
            will be used.
        @param state_file_path: path to the file where the state of the
            program will be saved. If C{None}, the state will not be saved.
            If the file already exists, the tree is loaded from it before
            crawling starts.
        @param save_period: time between saving the tree state. If
            C{state_file_path} is C{None}, this value is ignored.
        @param logging_level: one of the logging level constants from
            C{logging}
        """
        # Function-scope import: the module's import block does not bring
        # in C{sys}.
        import sys

        if log_file_path is not None:
            lenient_makedir(os.path.dirname(log_file_path))
        if state_file_path is not None:
            if os.path.exists(state_file_path):
                # The progress message has no trailing newline, so it has
                # to be flushed explicitly to be visible while the state
                # file is still being loaded.
                sys.stdout.write(
                    "State file already exists. Loading the tree from this "
                    "file and changing nodes with state PROCESSING to OPEN ... ")
                sys.stdout.flush()
                self.__load_state_file(state_file_path, sentinel)
                # The leading space reproduces the separator that the
                # Python 2 print statement emitted after a trailing comma.
                sys.stdout.write(" Done.\n")
            else:
                lenient_makedir(os.path.dirname(state_file_path))
        self.__tree = RWLockTreeAccessor(sentinel)
        self.__navigators = navigators
        self.__manager = None
        self.__state_file_path = state_file_path
        self.__save_period = save_period
        self.__activity_schedule = activity_schedule
        if activity_schedule is None:
            self.__activity_schedule = AlwaysActiveSchedule()
        self.__logging_level = logging_level
        self.__log_file_path = log_file_path

    def run(self):
        """
        Crawl the tree, one activity period after another, until the
        crawlers manager reports that its threads have finished.

        The file logger (if a log file path was given) and the tree saver
        thread (if a state file path was given) are shut down even when
        crawling is interrupted by an exception.

        @return: sentinel node
        @rtype: L{AbstractNode}
        """
        self.__manager = self._create_crawlers_manager(
            self.__tree, self.__navigators)
        use_log_file = self.__log_file_path is not None
        if use_log_file:
            Logger.start(file_path=self.__log_file_path,
                logging_level=self.__logging_level)
        try:
            while True:
                activity_time = self.__sleep_until_activity_period()
                saver_thread = None
                if self.__state_file_path is not None:
                    saver_thread = self.__start_tree_saver_thread()
                try:
                    self.__manager.start()
                    threads_finished = self.__manager.wait_until_finish(
                        timeout=activity_time)
                finally:
                    # Never leave the saver thread running once this
                    # activity period is over, whatever ended it.
                    if saver_thread is not None:
                        saver_thread.stop_activity()
                        saver_thread.join()
                if threads_finished:
                    break
        finally:
            if use_log_file:
                Logger.stop()
        return self.__tree.get_sentinel()

    def _create_crawlers_manager(self, tree, navigators):
        """
        Wrap every navigator in a L{NavigatorTreeWrapper} bound to C{tree}
        and build a L{CrawlersManager} out of the wrappers.

        @param tree: tree accessor shared by all the wrappers
        @param navigators: navigators to be wrapped, one wrapper each
        @return: the newly created manager
        @rtype: L{CrawlersManager}
        """
        navigator_wrappers = [
            NavigatorTreeWrapper(navigator, tree) for navigator in navigators]
        return CrawlersManager(tree, navigator_wrappers)

    # NOTE(review): the "def" line of this method is missing from the
    # listing; name and signature are taken from the call site in run().
    def __start_tree_saver_thread(self):
        """
        Create and start a daemon L{TreeSaverThread} built from the state
        file path, the tree and the save period.

        @return: the started thread
        @rtype: L{TreeSaverThread}
        """
        t = TreeSaverThread(
            self.__state_file_path, self.__tree, self.__save_period)
        t.setDaemon(True)
        t.start()
        return t

    # NOTE(review): the "def" line of this method is missing from the
    # listing; name and signature are taken from the call site in run().
    def __sleep_until_activity_period(self):
        """
        Sleep (stop program execution) until there's a time to wake up.

        @return: activity time, i.e. time until the start of the next
            sleep period, C{None} if such time point cannot be determined
            (as in case when the activity time will not stop in future).
        @rtype: number of seconds
        @raise Exception: if the schedule reports an inactivity period with
            no future mode change, i.e. the program would sleep forever.
        """
        while True:
            now = datetime.datetime.now()
            info = self.__activity_schedule.get_activity_info(now)
            if info.future_mode_change is None:
                if info.is_in_activity_period:
                    return None
                raise Exception("Going to sleep forever?")
            mode_change_time = (info.future_mode_change - now).total_seconds()
            if info.is_in_activity_period:
                logging.info("Starting activity for {:.1f} seconds "
                    "(according to schedule)".format(
                        mode_change_time))
                return mode_change_time
            logging.info("Going to sleep for {:.1f} seconds "
                "(according to schedule)".format(
                    mode_change_time))
            time.sleep(mode_change_time)
            logging.info("Awaken")
            # Loop again: re-query the schedule after waking up.

    @staticmethod
    def __load_state_file(file_path, sentinel):
        """
        Read the tree stored in C{file_path} into C{sentinel}, then apply
        the PROCESSING-to-OPEN state change starting from the child of the
        sentinel named C{"root"}.

        NOTE(review): the helper C{__change_state_from_PROCESSING_to_OPEN}
        called here is not visible in this listing; it is presumably the
        static method that follows this one -- confirm.

        @param file_path: path to the state file to be read
        @param sentinel: node the tree is read into
        @type sentinel: L{AbstractNode}
        """
        with open(file_path) as f:
            reader = XMLTreeReader(f)
            reader.read(sentinel)
        MultithreadedCrawler.__change_state_from_PROCESSING_to_OPEN(
            sentinel.get_child("root"))
146 147 @staticmethod
153