Package concurrent_tree_crawler :: Package html_multipage_navigator :: Module tree_navigator
[hide private]
[frames] | no frames]

Source Code for Module concurrent_tree_crawler.html_multipage_navigator.tree_navigator

  1  import logging 
  2  from collections import OrderedDict 
  3  from collections import deque 
  4   
  5  from concurrent_tree_crawler.abstract_tree_navigator import \ 
  6          AbstractTreeNavigator, NavigationException 
  7  from concurrent_tree_crawler.html_multipage_navigator.web_browser import \ 
  8          MechanizeBrowserCreator 
class HTMLMultipageNavigator(AbstractTreeNavigator):
    """
    A web site tree navigator.

    It is assumed that all web pages corresponding to the nodes of the tree on
    the given level have the same basic characteristics and are analyzed
    in the same way, namely by the same object inheriting from
    L{AbstractPageAnalyzer}. In particular, all of the leaf web pages are
    placed on the same level of the tree. Some of the parts of the tree might
    be missing, which results in marking certain nodes of the tree as ERROR.
    """

    # Suffix appended to a child name that is already taken by a sibling.
    __repetition_suffix_template = "-repetition_{}"
    # Upper bound on the number of suffixes tried for one child (100000).
    __generate_new_name_max_repetitions = int(10e4)

    def __init__(self, address, levels, browser_creator=None):
        """
        @param browser_creator: a creator of browsers that will be used
            while crawling the web site. The default browser used here
            is L{MechanizeBrowser}.
        @type browser_creator: L{AbstractWebBrowserCreator}
        @param levels: list of L{Level} objects. The first element is a level
            corresponding to the root node, the last one corresponds to
            leafs level.
        @param address: URL address string
        """
        self.__address = address
        self.__browser_creator = browser_creator
        if browser_creator is None:
            self.__browser_creator = MechanizeBrowserCreator()
        self.__br = None
        self.__levels = levels
        self.__path = None
        self.__children_history = None
        # Info about children on current level of tree structure:
        # an OrderedDict with the key as child name and the value
        # as a link to child web page. Filled in by start_in_root().
        self.__current_children = None

    def start_in_root(self):
        """
        Create a fresh browser, open the root address and reset the
        navigation state (path, children history, current children).
        """
        self.__br = self.__browser_creator.create()
        self.__br.open(self.__address)
        self.__path = [self.__levels[0].name]
        self.__children_history = _ChildrenHistory()
        self.__current_children = self.__get_current_children()

    def get_path(self):
        """
        @return: path to the tree node the navigator is currently in i.e.
            subsequent node names from the tree root to the current node
        """
        return self.__path

    def get_children(self):
        """
        @return: names of the children of the current node, in the order
            in which they were found on the node's web page(s)
        @rtype: list of strings
        """
        # list(...) gives the same snapshot list on Python 2 and Python 3
        # (on Python 3 a bare .keys() would be a live view of the dict).
        return list(self.__current_children.keys())

    def __get_current_children(self):
        """
        Collect the child links of the current node, following the
        "next page" links when the children are spread over many pages.

        The browser is moved back to the first page before returning.

        @return: children of the current node
        @rtype: OrderedDict mapping child name to the child's link
        """
        children = OrderedDict()
        page_index = 0
        child_links_retrieved_so_far = 0
        current_level = self.__levels[self.__get_current_level()]
        while True:
            page_links = current_level.page_analyzer.get_links(
                self.__br.response(), child_links_retrieved_so_far)
            for (name, link) in page_links.children:
                if name in children:
                    # Repeated child name: store it under a suffixed name.
                    new_name = self.__generate_new_name(name, children)
                    if new_name is None:
                        logging.error("Unable to generate a new name "
                            "for a repeating child name \"{}\" "
                            "(link=\"{}\") in node \"{}\": "
                            "all of the proposed new name variants are "
                            "already in use".format(
                                name, link, "/".join(self.__path)))
                        continue
                    children[new_name] = link
                else:
                    children[name] = link
            next_page_link = page_links.next_page_link
            if next_page_link is None:
                break
            self.__br.open(next_page_link)
            page_index += 1
            child_links_retrieved_so_far += len(page_links.children)
        if page_index > 0:
            self.__br.back(page_index)  # Get back to the first page
        return children

    @staticmethod
    def __generate_new_name(original_name, children_dict):
        """
        Find an unused variant of a repeating child name.

        @param original_name: child name that is already in use
        @param children_dict: container of the names that are in use
        @return: C{original_name} with the first free "-repetition_<i>"
            suffix (i = 1, 2, ...), or C{None} if all the allowed variants
            are already in use
        """
        max_rep = HTMLMultipageNavigator.__generate_new_name_max_repetitions
        template = HTMLMultipageNavigator.__repetition_suffix_template
        # A plain counter is used instead of xrange(), which does not exist
        # on Python 3; it also avoids building a list on Python 2.
        repetition = 1
        while repetition <= max_rep:
            name = original_name + template.format(repetition)
            if name not in children_dict:
                return name
            repetition += 1
        return None

    def __get_current_level(self):
        """
        @return: zero-based index of the current node's level (root is 0)
        """
        return len(self.get_path()) - 1

    def __is_on_leafs_level(self):
        """
        @return: C{True} if the current node is on the last defined level
        """
        return len(self.__path) == len(self.__levels)

    def move_to_child(self, child_name):
        """
        Open the web page of the given child and make it the current node.

        @param child_name: one of the names returned by L{get_children}
        @raise NavigationException: if anything goes wrong while opening
            the page or collecting its children
        """
        assert not self.__is_on_leafs_level()
        try:
            self.__br.open(self.__current_children[child_name])
            self.__path.append(child_name)
            # Remember the siblings so that move_to_parent() can restore them.
            self.__children_history.push(self.__current_children)
            self.__current_children = self.__get_current_children()
        except Exception as ex:
            raise NavigationException(ex)

    def move_to_parent(self):
        """
        Go back to the parent of the current node.

        @raise NavigationException: if anything goes wrong while going back
        """
        assert self.__get_current_level() > 0
        try:
            self.__br.back()
            self.__path = self.__path[:-1]
            self.__current_children = self.__children_history.pop()
        except Exception as ex:
            raise NavigationException(ex)

    # NOTE(review): the "def" line of this method (original line 137) was
    # missing from the listing; the name below is the one expected from the
    # AbstractTreeNavigator interface -- confirm against the base class.
    def process_node_and_check_if_is_leaf(self):
        """
        Let the page analyzer of the current level process the current page.

        @return: C{True} if the current node has no children,
            C{False} otherwise
        @raise NavigationException: if anything goes wrong while processing
        """
        try:
            response = self.__br.response()
            analyzer = self.__levels[self.__get_current_level()].page_analyzer
            analyzer.process(self.__path, response)
            return len(self.__current_children) == 0
        except Exception as ex:
            raise NavigationException(ex)
147
class _ChildrenHistory:
    """
    A history of children nodes on consecutive levels of hierarchy.

    Implemented as a LIFO stack: L{pop} returns the entry that was pushed
    most recently.
    """

    def __init__(self):
        # The deque is used as a stack: entries are appended to and popped
        # from its right end only.
        self.__queue = deque()

    def push(self, children):
        """
        Store the children info of one more level on top of the history.

        @param children: info about children on current level of tree structure
        @type children: OrderedDict with the key being the child name
            and the value being the link to child web page.
        """
        self.__queue.append(children)

    def pop(self):
        """
        Remove and return the most recently pushed children info.

        @return: info about children on current level of tree structure
        @rtype: OrderedDict with the key being the child name
            and the value being the link to child web page.
        @raise IndexError: if the history is empty
        """
        return self.__queue.pop()
172