Package concurrent_tree_crawler :: Package html_multipage_navigator :: Module sample_page_analyzer
[hide private]
[frames | no frames]

Source Code for Module concurrent_tree_crawler.html_multipage_navigator.sample_page_analyzer

  1  import re 
  2  import shutil 
  3  import os.path 
  4  from xml.etree.ElementTree import ElementTree 
  5  from concurrent_tree_crawler.abstract_tree_navigator import NavigationException 
  6  from concurrent_tree_crawler.common.file_helper import lenient_makedir 
  7  from concurrent_tree_crawler.html_multipage_navigator.abstract_page_analyzer \ 
  8          import PageLinks, AbstractPageAnalyzer, Level, AbstractLevelsCreator 
class PageAnalyzerException(NavigationException):
    """Error raised by the page analyzers of this module.

    A plain subclass of C{NavigationException}; it adds no attributes or
    behavior of its own.
    """
12
class MagazinePageAnalyzer(AbstractPageAnalyzer):
    """A class that parses magazine-level pages"""

    # NOTE(review): this listing jumps from source line 14 to source line 30,
    # so any members defined in between are not visible here and are not
    # reproduced.

    @staticmethod
    def __convert_date(text):
        """Convert a date like C{"January 5, 2012"} to C{"2012-01-05"}.

        @param text: date in the form C{"<month name> <day>, <year>"}, where
            the month is a full English month name
        @return: C{"<year>-<month>-<day>"} with month and day zero-padded to
            two digits; the year is copied as matched
        @raise ValueError: if C{text} does not have the expected form
        @raise KeyError: if the month name is not one of the twelve known
            names
        """
        months = {
            "January": 1, "February": 2, "March": 3, "April": 4,
            "May": 5, "June": 6, "July": 7, "August": 8, "September": 9,
            "October": 10, "November": 11, "December": 12,
        }
        # Raw string: "\w" and "\d" are regex classes, not string escapes.
        match = re.match(r"^(\w+) (\d+), (\d+)$", text)
        if match is None:
            # Fail with a readable message instead of an AttributeError on
            # None when the text is not a date in the expected form.
            raise ValueError("Unrecognized date format: {!r}".format(text))
        month_str, day_str, year_str = match.group(1, 2, 3)
        month = _convert_to_2_digit_number(months[month_str])
        day = _convert_to_2_digit_number(int(day_str))
        return "{}-{}-{}".format(year_str, month, day)
41
class IssuePageAnalyzer(AbstractPageAnalyzer):
    """A class that parses issues-level pages"""

    def __init__(self, dst_dir_path):
        """
        @param dst_dir_path: base directory; the directory for each processed
            page is created underneath it
        """
        self.__dst_dir_path = dst_dir_path

    def process(self, tree_path, page_file):
        """Create the directory for C{tree_path} and check for an error page.

        @param tree_path: non-empty sequence of path elements; the first
            (root) element is skipped when building the directory path
        @param page_file: file-like object with the page contents
        @raise PageAnalyzerException: propagated from C{_handle_error_page}
            when the page is an error page (an C{error.txt} file is written
            to the directory first)
        """
        assert len(tree_path) > 0
        relative_dir = _convert_tree_path_to_dir_path(tree_path)
        target_dir = os.path.join(self.__dst_dir_path, relative_dir)
        lenient_makedir(target_dir)
        _handle_error_page(page_file, os.path.join(target_dir, "error.txt"))
55
71
class ArticlePageAnalyzer(AbstractPageAnalyzer):
    """A class that downloads article pages"""

    def __init__(self, dst_dir_path):
        """
        @param dst_dir_path: base directory under which article files are
            stored
        """
        self.__dst_dir_path = dst_dir_path

    def process(self, tree_path, page_file):
        """Store the article page as C{<last element>.html}.

        The directory is derived from every element of C{tree_path} except
        the first (root) and the last one; the last element names the file.

        @param tree_path: non-empty sequence of path elements
        @param page_file: file-like object with the page contents
        @raise PageAnalyzerException: propagated from C{_handle_error_page}
            when the page is an error page (a C{<last element>-error.txt}
            file is written first and the page itself is not stored)
        """
        assert len(tree_path) > 0
        dir_path = os.path.join(
            self.__dst_dir_path,
            _convert_tree_path_to_dir_path(tree_path[:-1]))
        lenient_makedir(dir_path)
        error_page_path = os.path.join(dir_path, tree_path[-1]+"-error.txt")
        _handle_error_page(page_file, error_page_path)
        file_path = os.path.join(dir_path, tree_path[-1]+".html")
        self.__download_page(page_file, file_path)

    def __download_page(self, page_file, dst_file):
        """Copy the contents of C{page_file} into the file at C{dst_file}.

        @param page_file: file-like object to read from
        @param dst_file: path of the destination file (opened in binary mode)
        """
        # "with" closes the destination file even if the copy fails part-way.
        with open(dst_file, 'wb') as f:
            shutil.copyfileobj(page_file, f)
92
class LevelsCreator(AbstractLevelsCreator):
    """Builds the list of levels: magazine, issue and article."""

    def __init__(self, download_dir_path):
        """
        @param download_dir_path: directory handed to the issue and article
            page analyzers
        """
        self.__download_dir_path = download_dir_path

    def create(self):
        """
        @return: list of three C{Level} objects, in the order "magazine",
            "issue", "article", each paired with its page analyzer
        """
        download_dir = self.__download_dir_path
        levels = []
        levels.append(Level("magazine", MagazinePageAnalyzer()))
        levels.append(Level("issue", IssuePageAnalyzer(download_dir)))
        levels.append(Level("article", ArticlePageAnalyzer(download_dir)))
        return levels
101
def _handle_error_page(page_file, file_path):
    """Record and report an error page.

    Does nothing if C{page_file} is not an error page (as judged by
    C{_is_error_page}). Otherwise writes a one-line message to C{file_path}
    and raises.

    @param page_file: file-like object with the page contents
    @param file_path: path of the text file that receives the message
    @return: C{None} when the page is not an error page
    @raise PageAnalyzerException: iff the given page is an error page
    """
    if not _is_error_page(page_file):
        return
    with open(file_path, "w") as f:
        # f.write() rather than "print >>f": the latter is Python 2-only
        # syntax that fails with a TypeError at runtime on Python 3. The
        # text written (message plus newline) is the same.
        f.write("An error message shown after requesting the page\n")
    raise PageAnalyzerException("Error page encountered")
109
def _is_error_page(page_file):
    """Tell whether the page's title marks it as an error page.

    The page is parsed with C{ElementTree}; it is an error page when the
    text of its C{head/title} element contains C{"not available"}.

    @param page_file: seekable file-like object with the page contents; its
        position is reset to 0 before returning
    @return: C{True} iff the title text contains C{"not available"};
        C{False} also when the page has no title element or the title is
        empty
    """
    try:
        doc = ElementTree(file=page_file)
    finally:
        ## reset file's current position, also when parsing fails, so the
        ## caller never gets back a half-consumed stream
        page_file.seek(0)
    title = doc.find("head/title")
    if title is None or title.text is None:
        # No usable title: nothing identifies this as an error page.
        return False
    return "not available" in title.text
117
def _convert_tree_path_to_dir_path(tree_path):
    """Join all elements of C{tree_path} except the first one with C{"/"}.

    @param tree_path: sequence of strings; element 0 (the root) is skipped
    @return: the remaining elements joined with C{"/"}, or an empty string
        when nothing remains
    """
    without_root = tree_path[1:]
    separator = '/'
    return separator.join(without_root)
121
def _convert_to_2_digit_number(number):
    """Format a positive number below 100 as a two-character string.

    @param number: integer in the range 1..99
    @return: C{str(number)}, left-padded with C{"0"} to two characters
    @raise AssertionError: if C{number} is outside 1..99
    """
    # The upper bound used to be "< 99", which wrongly rejected 99.
    assert 0 < number < 100, "number must be in the range 1..99"
    return str(number).zfill(2)
128