1 import logging
2 from collections import OrderedDict
3 from collections import deque
4
5 from concurrent_tree_crawler.abstract_tree_navigator import \
6 AbstractTreeNavigator, NavigationException
7 from concurrent_tree_crawler.html_multipage_navigator.web_browser import \
8 MechanizeBrowserCreator
9
class HTMLMultipageNavigator(AbstractTreeNavigator):
    """
    A web site tree navigator.

    It is assumed that all web pages corresponding to the nodes of the tree on
    the given level have the same basic characteristics and are analyzed
    in the same way, namely by the same object inheriting from
    L{AbstractPageAnalyzer}. In particular, all of the leaf web pages are
    placed on the same level of the tree. Some of the parts of the tree might
    be missing, which results in marking certain nodes of the tree as ERROR.

    The children of a single node may be spread over several consecutive
    web pages; the navigator follows the "next page" links reported by the
    level's page analyzer to collect all of them.
    """

    # Suffix appended to a child name that repeats within one node.
    __repetition_suffix_template = "-repetition_{}"
    # Upper bound on the number of suffix variants tried for one name.
    __generate_new_name_max_repetitions = int(10e4)

    def __init__(self, address, levels, browser_creator=None):
        """
        @param address: URL address string
        @param levels: list of L{Level} objects. The first element is a level
            corresponding to the root node, the last one corresponds to
            leafs level.
        @param browser_creator: a creator of browsers that will be used
            while crawling the web site. When C{None}, a
            L{MechanizeBrowserCreator} is used.
        @type browser_creator: L{AbstractWebBrowserCreator}
        """
        self.__address = address
        if browser_creator is None:
            browser_creator = MechanizeBrowserCreator()
        self.__browser_creator = browser_creator
        # The attributes below stay None until start_in_root() is called.
        self.__br = None
        self.__levels = levels
        self.__path = None
        self.__children_history = None
        # Info about children on current level of tree structure:
        # an OrderedDict with the key as child name and the value
        # as a link to child web page.
        self.__current_children = None

    def start_in_root(self):
        """
        Create a fresh browser, open the root address and reset the
        navigation state (path, children history, current children).
        """
        self.__br = self.__browser_creator.create()
        self.__br.open(self.__address)
        self.__path = [self.__levels[0].name]
        self.__children_history = _ChildrenHistory()
        self.__current_children = self.__get_current_children()

    def get_path(self):
        """
        @return: path to the tree node the navigator is currently in i.e.
            subsequent node names from the tree root to the current node
        """
        return self.__path

    def get_children(self):
        """
        @return: list of names of the children of the current node, in the
            order in which they were found
        """
        # list() keeps the result a real list on Python 3 as well, where
        # keys() alone would return a view.
        return list(self.__current_children.keys())

    def __get_current_children(self):
        """
        Collect the children of the current node from all of its pages.

        Starting from the page currently loaded in the browser, links are
        extracted by the page analyzer of the current level and the
        "next page" link is followed until there is none. Afterwards the
        browser is moved back by the number of pages that were opened.

        A child whose name repeats within the node is stored under a
        generated name; if no free name can be generated, an error is
        logged and that child is skipped.

        @return: C{OrderedDict} mapping child name to the link to the
            child's web page
        """
        children = OrderedDict()
        pages_opened = 0
        child_links_retrieved_so_far = 0
        current_level = self.__levels[self.__get_current_level()]
        while True:
            page_links = current_level.page_analyzer.get_links(
                self.__br.response(), child_links_retrieved_so_far)
            for (name, link) in page_links.children:
                key = name
                if key in children:
                    key = self.__generate_new_name(name, children)
                    if key is None:
                        logging.error("Unable to generate a new name "
                            "for a repeating child name \"{}\" "
                            "(link=\"{}\") in node \"{}\": "
                            "all of the proposed new name variants are "
                            "already in use".format(
                                name, link, "/".join(self.__path)))
                        continue
                children[key] = link
            next_page_link = page_links.next_page_link
            if next_page_link is None:
                break
            self.__br.open(next_page_link)
            pages_opened += 1
            child_links_retrieved_so_far += len(page_links.children)
        if pages_opened > 0:
            # Return to the first page of the node.
            self.__br.back(pages_opened)
        return children

    @staticmethod
    def __generate_new_name(original_name, children_dict):
        """
        Find an unused variant of a repeating child name.

        @param original_name: the child name that is already in use
        @param children_dict: dictionary whose keys are the names in use
        @return: C{original_name} with the first free repetition suffix
            appended, or C{None} if all variants up to the limit are taken
        """
        max_rep = HTMLMultipageNavigator.__generate_new_name_max_repetitions
        suffix_template = HTMLMultipageNavigator.__repetition_suffix_template
        # A plain counter loop is used instead of xrange()/range() so that
        # the code runs lazily and unchanged on both Python 2 and Python 3.
        attempt = 1
        while attempt <= max_rep:
            candidate = original_name + suffix_template.format(attempt)
            if candidate not in children_dict:
                return candidate
            attempt += 1
        return None

    def __get_current_level(self):
        """
        @return: zero-based index of the level of the current node
            (0 for the root)
        """
        return len(self.get_path()) - 1

    def __is_on_leafs_level(self):
        """
        @return: C{True} if the current path is as long as the list of
            levels, i.e. the navigator is on the last level
        """
        return len(self.__path) == len(self.__levels)

    def move_to_child(self, child_name):
        """
        Open the page of the given child and make it the current node.

        @param child_name: name of one of the children of the current node
        @raise NavigationException: wraps any exception raised while
            opening the page or collecting the child's own children.
            NOTE(review): the path and the children history are updated
            before the new children are collected, so they may already
            include the child when this exception is raised.
        """
        assert not self.__is_on_leafs_level()
        try:
            self.__br.open(self.__current_children[child_name])
            self.__path.append(child_name)
            self.__children_history.push(self.__current_children)
            self.__current_children = self.__get_current_children()
        except Exception as ex:
            raise NavigationException(ex)

    def move_to_parent(self):
        """
        Go back to the parent of the current node and restore the parent's
        children from the history.

        @raise NavigationException: wraps any exception raised while
            moving back
        """
        assert self.__get_current_level() > 0
        try:
            self.__br.back()
            self.__path = self.__path[:-1]
            self.__current_children = self.__children_history.pop()
        except Exception as ex:
            raise NavigationException(ex)

    # NOTE(review): the header line of this method was missing from the
    # reviewed text; the name below is assumed to be the corresponding hook
    # of AbstractTreeNavigator -- confirm against the base class.
    def process_node_and_check_if_is_leaf(self):
        """
        Let the page analyzer of the current level process the current page.

        @return: C{True} if the current node has no children,
            C{False} otherwise
        @raise NavigationException: wraps any exception raised while
            processing the page
        """
        try:
            response = self.__br.response()
            analyzer = self.__levels[self.__get_current_level()].page_analyzer
            analyzer.process(self.__path, response)
            return len(self.__current_children) == 0
        except Exception as ex:
            raise NavigationException(ex)
147
149 """
150 A history of children nodes on consecutive levels of hierarchy.
151 Implemented as a FIFO queue.
152 """
153
154 - def __init__(self):
155 self.__queue = deque()
156
157 - def push(self, children):
158 """
159 @param children: info about children on current level of tree structure
160 @type children: L{OrderedDictionary} with the key being the child name
161 and the value being the link to child web page.
162 """
163 self.__queue.append(children)
164
166 """
167 @return: info about children on current level of tree structure
168 @rtype: L{OrderedDictionary} with the key being the child name
169 and the value being the link to child web page.
170 """
171 return self.__queue.pop()
172