diff --git a/Scallywag b/Scallywag
index 368011a..eb13578 100755
--- a/Scallywag
+++ b/Scallywag
@@ -69,7 +69,7 @@ class Scallywag:
         self.searcher = searcher.Scraper( self.config )
 
         results = self.searcher.get_results( search_terms )
-
+        self.status("{0} results found.".format( len(results) ) )
         for result in results:
             yield result
 
diff --git a/config.ini b/config.ini
index a3b3b8e..00d56e2 100644
--- a/config.ini
+++ b/config.ini
@@ -1,2 +1,2 @@
 [list_source]
-piratebay_proxy_list = thepiratebay-proxylist.se
+piratebay_proxy_list = https://piratebayproxy.info/
diff --git a/rsrc/Scraper/__pycache__/proxylister.cpython-37.pyc b/rsrc/Scraper/__pycache__/proxylister.cpython-37.pyc
index 772e9b9..59d3873 100644
Binary files a/rsrc/Scraper/__pycache__/proxylister.cpython-37.pyc and b/rsrc/Scraper/__pycache__/proxylister.cpython-37.pyc differ
diff --git a/rsrc/Scraper/__pycache__/searcher.cpython-37.pyc b/rsrc/Scraper/__pycache__/searcher.cpython-37.pyc
index 3b154d6..09a1533 100644
Binary files a/rsrc/Scraper/__pycache__/searcher.cpython-37.pyc and b/rsrc/Scraper/__pycache__/searcher.cpython-37.pyc differ
diff --git a/rsrc/Scraper/proxylister.py b/rsrc/Scraper/proxylister.py
index 9c4ad97..951eac2 100644
--- a/rsrc/Scraper/proxylister.py
+++ b/rsrc/Scraper/proxylister.py
@@ -9,7 +9,7 @@ class Scraper:
         self.config = config
 
     def get_proxies(self):
-        fetch_results = self.client.get( "https://" + self.config.proxylist_url )
+        fetch_results = self.client.get( self.config.proxylist_url )
         proxy_list = self.Parser.scrape( "proxy_list", fetch_results.content )
 
         return proxy_list
@@ -24,7 +24,9 @@ class Scraper:
        @staticmethod
        def proxy_list( text ):
            proxyTable = html.fromstring( text )
-            proxyTable_xpath = proxyTable.xpath( '//table[@class="proxies"]/tbody/tr/@data-domain' )
+            proxyTable_xpath = proxyTable.xpath('//body[@id="mainPage"]/div[@class="container"]/div[@id="content"]/table[@id="searchResult"]/tr/td[@class="site"]/a/text()')
+            for proxy in proxyTable_xpath:
+                print("Available Proxy: {0}".format( proxy ) )
            return proxyTable_xpath
 
 class SessionError( Exception ):
diff --git a/rsrc/Scraper/searcher.py b/rsrc/Scraper/searcher.py
index 08363e8..257391d 100644
--- a/rsrc/Scraper/searcher.py
+++ b/rsrc/Scraper/searcher.py
@@ -3,6 +3,7 @@
 from lxml import html
 import urllib
 import re
 import json
+import sys
 
 class Result:
     def __init__(self, title, seeders, leechers, size, author, url):
@@ -32,24 +33,30 @@ class Scraper:
     def craft_url(self, protocol, proxy, search_terms):
         # https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
-        f = { 'q': search_terms, 'category': 0, 'page': 0, 'orderby': 99 }
-        url = str.format( "{0}://{1}/s/?{2}", protocol, proxy, urllib.parse.urlencode(f) )
+        f = { 'q': search_terms, 'page': 0, 'orderby': 99 }
+        # https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
+        # https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
+        url = "{0}://{1}/s/?{2}".format( protocol, proxy, urllib.parse.urlencode(f) )
         print(url)
 
         return url
 
     def get_results(self, search_terms):
         url = self.craft_url( "https", self.config.proxy, search_terms )
-        fetch_results = self.client.get( url )
+        try:
+            fetch_results = self.client.get( url )
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         results_list = self.Parser.scrape( "results_list", fetch_results.content )
 
         return results_list
 
     def get_magnet(self, url):
-        url = "https://" + self.config.proxy + url
-        fetch_results = self.client.get(url)
-
+        try:
+            fetch_results = self.client.get(url)
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         magnet = self.Parser.scrape( "magnet_link", fetch_results.content )
 
         return magnet
@@ -71,26 +78,40 @@ class Scraper:
             results_buffer = list()
 
             for tr in resultsTable_xpath:
-                title = tr.xpath('td[2]/div[1]/a[1]/text()')
-                seeders = tr.xpath('td[3]/text()')[0]
-                leechers = tr.xpath('td[4]/text()')[0]
-                author = tr.xpath('td[2]/font/a/text()')
-                size_unprocessed = tr.xpath('td[2]/font/text()')[0]
-                url = tr.xpath('td/div[@class="detName"]/a[@class="detLink"]/@href')[0]
+                title = Scraper.Parser.scrape_helper( tr, 'td[2]/div[1]/a[1]/text()' )
+                seeders = Scraper.Parser.scrape_helper( tr, 'td[3]/text()' )
+                leechers = Scraper.Parser.scrape_helper( tr, 'td[4]/text()' )
+                url = Scraper.Parser.scrape_helper( tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href' )
+
+                size_unprocessed = Scraper.Parser.scrape_helper( tr, 'td[2]/font/text()' )
                 m = re.search('Size (.+?),', size_unprocessed)
                 if m:
                     size = m.group(1)
+                author = Scraper.Parser.scrape_helper( tr, 'td[2]/font[@class="detDesc"]/*/text()' )
+
+
+                print("Result: {0}".format( Result(title, seeders, leechers, size, author, url) ) )
                 results_buffer.append( Result(title, seeders, leechers, size, author, url) )
+            # hack
+            nav = results_buffer.pop()
+
             return results_buffer
 
+        @staticmethod
+        def scrape_helper( tr, xpathq ):
+            try:
+                val = tr.xpath( xpathq )[0]
+            except IndexError:
+                val = "0"
+            return val
 
         @staticmethod
         def magnet_link( text ):