Source code for http_request_randomizer.requests.parsers.SamairProxyParser

import logging

import requests
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


class SamairProxyParser(UrlParser):
    def __init__(self, web_url):
        UrlParser.__init__(self, web_url)
    def parse_proxyList(self):
        curr_proxy_list = []
        content = requests.get(self.get_URl()).content
        soup = BeautifulSoup(content, "html.parser")
        # The site's CSS used to encode the port numbers; the reversing logic
        # below is kept for reference but is disabled:
        # for href in soup.findAll('link'):
        #     if '/styles/' in href.get('href'):
        #         style = "http://www.samair.ru" + href.get('href')
        #         break
        # css = requests.get(style).content.split('\n')
        # css.pop()
        # ports = {}
        # for l in css:
        #     p = l.split(' ')
        #     key = p[0].split(':')[0][1:]
        #     value = p[1].split('\"')[1]
        #     ports[key] = value
        table = soup.find("table", attrs={"id": "proxylist"})
        # The first tr contains the field names (collected but not used below).
        headings = [th.get_text() for th in table.find("tr").find_all("th")]
        for row in table.find_all("tr")[1:]:
            # The first td of each row holds the 'IP:PORT' string.
            td_row = row.find("td")
            # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
            # Make sure it is a valid proxy address
            if UrlParser.valid_ip_port(td_row.text):
                curr_proxy_list.append('http://' + td_row.text)
            else:
                logger.debug("Address with invalid format: {}".format(td_row.text))
        return curr_proxy_list
    def __str__(self):
        return "SamairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
            .format(self.url, self.minimum_bandwidth_in_KBs)
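

A minimal usage sketch, assuming the target page still serves a table with id "proxylist" that valid_ip_port can validate row by row; the URL below is illustrative only, not necessarily the address the project itself uses:

if __name__ == "__main__":
    # Hypothetical demo: point the parser at a proxy-list page.
    # (Illustrative URL; the project wires the real address up elsewhere.)
    parser = SamairProxyParser("https://premproxy.com/list/")
    proxies = parser.parse_proxyList()
    # Each entry is a string of the form 'http://IP:PORT'.
    for proxy in proxies[:5]:
        print(proxy)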