Scallywag/rsrc/Scraper/proxylister.py

37 lines
1.2 KiB
Python
Raw Permalink Normal View History

2018-10-08 03:08:35 +00:00
import requests
from lxml import html
class Scraper:
def __init__( self, config ):
# the request client, belongs to session even if no "user session" is needed
self.client = requests.Session()
self.config = config
def get_proxies(self):
2021-07-06 05:34:57 +00:00
print("Fetching raw HTML from '{0}'".format( self.config.proxylist_url))
fetch_results = self.client.get( self.config.proxylist_url )
2018-10-08 03:08:35 +00:00
proxy_list = self.Parser.scrape( "proxy_list", fetch_results.content )
return proxy_list
class Parser:
@staticmethod
def scrape( datapoint, text ):
cases = {
"proxy_list": Scraper.Parser.proxy_list
}
return cases[ datapoint ]( text )
@staticmethod
def proxy_list( text ):
proxyTable = html.fromstring( text )
proxyTable_xpath = proxyTable.xpath('//body[@id="mainPage"]/div[@class="container"]/div[@id="content"]/table[@id="searchResult"]/tr/td[@class="site"]/a/text()')
for proxy in proxyTable_xpath:
print("Available Proxy: {0}".format( proxy ) )
2018-10-08 03:08:35 +00:00
return proxyTable_xpath
class SessionError( Exception ):
def __init__( self, value ):
self.value = value