1 import re
2 import shutil
3 import os.path
4 from xml.etree.ElementTree import ElementTree
5 from concurrent_tree_crawler.abstract_tree_navigator import NavigationException
6 from concurrent_tree_crawler.common.file_helper import lenient_makedir
7 from concurrent_tree_crawler.html_multipage_navigator.abstract_page_analyzer \
8 import PageLinks, AbstractPageAnalyzer, Level, AbstractLevelsCreator
9
class PageAnalyzerException(NavigationException):
    """Raised by the page analyzers in this module when a page cannot be
    processed, e.g. when an error page is encountered."""
12
class MagazinePageAnalyzer(AbstractPageAnalyzer):
    """A class that parses magazine-level pages"""

    def get_links(self, page_file, child_links_retrieved_so_far):
        """Collect the child links and the next-page link of a page.

        The text of every child link is expected to be a date such as
        C{"March 5, 2010"}; it is converted to C{"2010-03-05"} and used as
        the name of the child.

        @param page_file: file-like object with the page; it is parsed
            as XML
        @param child_links_retrieved_so_far: not used by this analyzer
        @return: C{PageLinks} built from a list of C{(name, href)} pairs
            and the C{href} of the next-page link (C{None} if the page
            has no such link)
        """
        doc = ElementTree(file=page_file)
        children = []
        for link in doc.findall("body/div/div[@id='elements']/table/tr/td/a"):
            link_text = self.__convert_date(link.text)
            children.append((link_text, link.attrib["href"]))
        next_page_elem = \
            doc.find("body/div/div[@id='navigation']/table/tr/td[3]/a")
        next_page_link = None
        if next_page_elem is not None:
            next_page_link = next_page_elem.attrib["href"]
        return PageLinks(children, next_page_link)

    @staticmethod
    def __convert_date(text):
        """Convert a date like C{"March 5, 2010"} to C{"2010-03-05"}.

        @param text: string of the form C{"<Month name> <day>, <year>"}
        @return: string of the form C{"<year>-<MM>-<DD>"}; the year is
            copied verbatim
        @raise AttributeError: if C{text} does not have the expected form
        @raise KeyError: if the month name is not an English month name
        """
        months = {"January": 1, "February": 2, "March": 3, "April": 4,
            "May": 5, "June": 6, "July": 7, "August": 8, "September": 9,
            "October": 10, "November": 11, "December": 12}
        # Raw string: "\w" and "\d" are regex classes, not string escapes.
        (month_str, day_str, year_str) = \
            re.match(r"^(\w+) (\d+), (\d+)$", text).group(1, 2, 3)
        month = _convert_to_2_digit_number(months[month_str])
        day = _convert_to_2_digit_number(int(day_str))
        return "{}-{}-{}".format(year_str, month, day)
41
class IssuePageAnalyzer(AbstractPageAnalyzer):
    """A class that parses issues-level pages"""

    def __init__(self, dst_dir_path):
        """@param dst_dir_path: directory under which the directories of
            the processed tree paths are created"""
        self.__dst_dir_path = dst_dir_path

    def process(self, tree_path, page_file):
        """Create the directory of the given tree path and check the page.

        If the page turns out to be an error page, C{_handle_error_page}
        writes a note to C{error.txt} inside that directory and raises.

        @param tree_path: non-empty sequence of tree node names
        @param page_file: file-like object with the page
        """
        assert len(tree_path) > 0
        issue_dir = os.path.join(
            self.__dst_dir_path, _convert_tree_path_to_dir_path(tree_path))
        lenient_makedir(issue_dir)
        _handle_error_page(page_file, os.path.join(issue_dir, "error.txt"))

    def get_links(self, page_file, child_links_retrieved_so_far):
        """Collect the child links and the next-page link of a page.

        Children are named with consecutive two-digit numbers; numbering
        continues from C{child_links_retrieved_so_far}.

        @param page_file: file-like object with the page; it is parsed
            as XML
        @param child_links_retrieved_so_far: number of child links already
            returned for earlier pages of the same node
        @return: C{PageLinks} built from a list of C{(name, href)} pairs
            and the C{href} of the next-page link (C{None} if absent)
        """
        page = ElementTree(file=page_file)
        anchors = page.findall("body/div/div[@id='elements']/table/tr/td/a")
        first_number = child_links_retrieved_so_far + 1
        children = [
            (_convert_to_2_digit_number(number), anchor.attrib["href"])
            for number, anchor in enumerate(anchors, first_number)
        ]
        next_anchor = page.find(
            "body/div/div[@id='navigation']/table/tr/td[3]/a")
        next_page_link = (
            next_anchor.attrib["href"] if next_anchor is not None else None)
        return PageLinks(children, next_page_link)
71
class ArticlePageAnalyzer(AbstractPageAnalyzer):
    """A class that downloads article pages"""

    def __init__(self, dst_dir_path):
        """@param dst_dir_path: directory under which the downloaded
            pages are stored"""
        self.__dst_dir_path = dst_dir_path

    def process(self, tree_path, page_file):
        """Store the page in a file named after the last tree path element.

        The target directory is derived from all but the last element of
        C{tree_path}. If the page is an error page, C{_handle_error_page}
        writes a note to C{<name>-error.txt} and raises, so the page itself
        is not stored; otherwise it is copied to C{<name>.html}.

        @param tree_path: non-empty sequence of tree node names
        @param page_file: file-like object with the page
        """
        assert len(tree_path) > 0
        dir_path = os.path.join(self.__dst_dir_path,
            _convert_tree_path_to_dir_path(tree_path[:-1]))
        lenient_makedir(dir_path)
        error_page_path = os.path.join(dir_path, tree_path[-1]+"-error.txt")
        _handle_error_page(page_file, error_page_path)
        file_path = os.path.join(dir_path, tree_path[-1]+".html")
        self.__download_page(page_file, file_path)

    def __download_page(self, page_file, dst_file):
        """Copy the contents of C{page_file} to the file at path
        C{dst_file}."""
        # "with" closes the destination file even if the copy fails.
        with open(dst_file, 'wb') as f:
            shutil.copyfileobj(page_file, f)
92
95 self.__download_dir_path = download_dir_path
96
101
def _handle_error_page(page_file, file_path):
    """Record and report an error page.

    Nothing happens unless C{_is_error_page} recognizes the page as an
    error page. In that case a one-line note is written to C{file_path}
    and an exception is raised.

    @param page_file: file-like object with the page
    @param file_path: path of the file the note is written to
    @return: C{None}
    @raise PageAnalyzerException: if the given page is an error page
    """
    if not _is_error_page(page_file):
        return
    with open(file_path, "w") as f:
        # f.write produces the same output as the Python-2-only
        # "print >>f" form and also works on Python 3.
        f.write("An error message shown after requesting the page\n")
    raise PageAnalyzerException("Error page encountered")
109
def _is_error_page(page_file):
    """Tell whether the given page is an error page.

    A page counts as an error page when the text of its C{head/title}
    element contains C{"not available"}. A page without a title, or with
    an empty one, is not considered an error page.

    @param page_file: seekable file-like object with the page; it is
        parsed as XML and rewound to the beginning afterwards
    @return: C{True} iff the given page is an error page
    """
    doc = ElementTree(file=page_file)
    # Rewind so that the caller can read the page again from the start.
    page_file.seek(0)
    title = doc.find("head/title")
    title_text = title.text if title is not None else None
    return bool(title_text) and "not available" in title_text
117
119
def _convert_tree_path_to_dir_path(tree_path):
    """Build a relative directory path from a tree path.

    @param tree_path: sequence of strings
    @return: all elements except the first one, joined with C{"/"}
        (an empty string if there is at most one element)
    """
    # The first element is skipped on purpose.
    # NOTE(review): presumably it names the tree root -- confirm.
    return '/'.join(tree_path[1:])
121
def _convert_to_2_digit_number(number):
    """Format a positive integer as a zero-padded two-digit string.

    @param number: integer in the range 1..99 (both ends included)
    @return: two-character string, e.g. C{"07"} for 7 and C{"42"} for 42
    """
    # 99 is still a valid two-digit number, so the upper bound is inclusive.
    assert 0 < number <= 99
    return "{:02d}".format(number)
128