Source code for http_request_randomizer.requests.parsers.ProxyForEuParser

import logging

import requests
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Author metadata for this module.
__author__ = 'pgaref'


class ProxyForEuParser(UrlParser):
    """Parser that scrapes proxy addresses from a ``proxy_list`` HTML table.

    The page at the configured URL is expected to contain a
    ``<table class="proxy_list">`` whose first row holds the column names
    (``IP``, ``Port``, ``Speed`` are the ones used here) and whose remaining
    rows each describe one proxy.

    NOTE(review): ``get_URl``, ``get_min_bandwidth``, ``valid_ip``,
    ``valid_ip_port``, ``url`` and ``minimum_bandwidth_in_KBs`` are assumed to
    be provided by ``UrlParser``, which is not defined in this module.
    """

    def __init__(self, web_url, bandwithdh=None):
        """Initialise the parser.

        :param web_url: URL of the page listing the proxies.
        :param bandwithdh: minimum bandwidth requirement, forwarded unchanged
            to ``UrlParser.__init__`` (parameter name kept as-is for
            backward compatibility with keyword callers).
        """
        UrlParser.__init__(self, web_url, bandwithdh)

    def parse_proxyList(self):
        """Download the provider page and return the usable proxies.

        :return: list of ``"http://<ip>:<port>"`` strings, one per table row
            that has a valid IP, a valid ``ip:port`` address and a speed that
            is not below the minimum bandwidth. An empty list is returned
            when the request fails or the expected table is not in the page.
        """
        response = requests.get(self.get_URl())
        if not response.ok:
            # Do not try to parse an error page as if it were the proxy list.
            logger.warning("Proxy provider url failed: %s", self.get_URl())
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table", attrs={"class": "proxy_list"})
        if table is None:
            # Layout changed or unexpected page: nothing to parse.
            logger.warning("No 'proxy_list' table found at: %s", self.get_URl())
            return []

        rows = table.find_all("tr")
        if not rows:
            return []

        # The first tr contains the field names.
        headings = [th.get_text() for th in rows[0].find_all("th")]

        curr_proxy_list = []
        for row in rows[1:]:
            cells = (td.get_text() for td in row.find_all("td"))
            # Key the cells by column name so the result does not depend on
            # the order in which the columns appear in the table.
            address = self._extract_address(dict(zip(headings, cells)))
            if address is not None:
                curr_proxy_list.append("http://" + address)
        return curr_proxy_list

    def _extract_address(self, fields):
        """Return ``"ip:port"`` for one table row, or ``None`` to discard it.

        :param fields: mapping of column name to cell text for a single row.
        """
        ip = fields.get('IP')
        # Make sure it is a Valid IP
        if ip is None or not UrlParser.valid_ip(ip):
            logger.debug("IP with Invalid format: %s", ip)
            return None

        # Discard slow proxies! Speed is in KB/s
        speed = fields.get('Speed')
        if speed is not None:
            try:
                speed_kbs = float(speed)
            except ValueError:
                # A malformed cell must not abort the whole parse; the speed
                # cannot be verified, so the proxy is treated as too slow.
                logger.debug("Unparseable speed %r for proxy %s", speed, ip)
                return None
            if speed_kbs < self.get_min_bandwidth():
                return None

        # Make sure it is a Valid Proxy Address
        address = ip + ':' + fields.get('Port', "")
        if not UrlParser.valid_ip_port(address):
            return None
        return address

    def __str__(self):
        """Return a human-readable description of this parser."""
        return "ProxyForEU Parser of '{0}' with required bandwidth: '{1}' KBs" \
            .format(self.url, self.minimum_bandwidth_in_KBs)