updated to reflect TPB site changes and various bug fixes
parent 3725629ada
commit 3cff05035b
@@ -69,7 +69,7 @@ class Scallywag:
         self.searcher = searcher.Scraper( self.config )

         results = self.searcher.get_results( search_terms )
-
+        self.status("{0} results found.".format( len(results) ) )
         for result in results:
             yield result

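Review note: get_results() returns a list (see the searcher hunk further down), so len() is safe here even though the enclosing method is a generator. A minimal sketch of the surrounding method as this hunk implies it; the name `search` and the exact behaviour of the `status` helper are assumptions, since neither is shown in this diff:

    # Hedged sketch -- 'search' is a hypothetical name for the enclosing
    # generator method, and self.status is assumed to display a message.
    def search(self, search_terms):
        results = self.searcher.get_results(search_terms)
        self.status("{0} results found.".format(len(results)))
        for result in results:
            yield result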
@@ -1,2 +1,2 @@
 [list_source]
-piratebay_proxy_list = thepiratebay-proxylist.se
+piratebay_proxy_list = https://piratebayproxy.info/
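Review note: the proxy-list source now carries its scheme inside the config value. A minimal sketch of reading an INI section like this one, assuming the standard library's configparser; the project's actual loader and config filename are not shown in this diff:

    import configparser

    config = configparser.ConfigParser()
    config.read("scallywag.ini")  # hypothetical filename

    # The value is now a full URL, scheme included, so callers no longer
    # need to prepend "https://" (see the get_proxies() hunk below).
    proxylist_url = config["list_source"]["piratebay_proxy_list"]
    print(proxylist_url)  # https://piratebayproxy.info/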
Binary file not shown.
Binary file not shown.
@@ -9,7 +9,7 @@ class Scraper:
         self.config = config

     def get_proxies(self):
-        fetch_results = self.client.get( "https://" + self.config.proxylist_url )
+        fetch_results = self.client.get( self.config.proxylist_url )
         proxy_list = self.Parser.scrape( "proxy_list", fetch_results.content )
         return proxy_list

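Review note: assuming self.client is a requests session or the requests module itself, a bare hostname is rejected outright, which is why the scheme had to move into the config value before the hardcoded prefix could be dropped:

    import requests

    # A bare hostname raises requests.exceptions.MissingSchema:
    #   requests.get("piratebayproxy.info")
    # The full URL now stored in the config works as-is:
    requests.get("https://piratebayproxy.info/")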
@@ -24,7 +24,9 @@ class Scraper:
         @staticmethod
         def proxy_list( text ):
             proxyTable = html.fromstring( text )
-            proxyTable_xpath = proxyTable.xpath( '//table[@class="proxies"]/tbody/tr/@data-domain' )
+            proxyTable_xpath = proxyTable.xpath('//body[@id="mainPage"]/div[@class="container"]/div[@id="content"]/table[@id="searchResult"]/tr/td[@class="site"]/a/text()')
+            for proxy in proxyTable_xpath:
+                print("Available Proxy: {0}".format( proxy ) )
             return proxyTable_xpath

 class SessionError( Exception ):
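Review note: a small sketch exercising the new XPath against a hand-made document shaped the way the expression expects; the real proxy-list page's markup may differ. Note that lxml's HTML parser, unlike browsers, does not inject tbody elements, which is why table/tr matches directly:

    from lxml import html

    sample = '''<html><body id="mainPage"><div class="container">
      <div id="content"><table id="searchResult">
        <tr><td class="site"><a>thepiratebay0.org</a></td></tr>
        <tr><td class="site"><a>thepiratebay10.org</a></td></tr>
      </table></div></div></body></html>'''

    tree = html.fromstring(sample)
    proxies = tree.xpath('//body[@id="mainPage"]/div[@class="container"]'
                         '/div[@id="content"]/table[@id="searchResult"]'
                         '/tr/td[@class="site"]/a/text()')
    print(proxies)  # ['thepiratebay0.org', 'thepiratebay10.org']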
@@ -3,6 +3,7 @@ from lxml import html
 import urllib
 import re
 import json
+import sys

 class Result:
     def __init__(self, title, seeders, leechers, size, author, url):
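Review note: import sys backs the new print( e, file=sys.stderr ) calls in the hunks below; sending diagnostics to stderr keeps them out of stdout, where this scraper already prints URLs and results:

    import sys

    print("Result: ...")                          # results go to stdout
    print("connection refused", file=sys.stderr)  # diagnostics go to stderr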
@@ -32,24 +33,30 @@ class Scraper:

     def craft_url(self, protocol, proxy, search_terms):
         # https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
-        f = { 'q': search_terms, 'category': 0, 'page': 0, 'orderby': 99 }
-        url = str.format( "{0}://{1}/s/?{2}", protocol, proxy, urllib.parse.urlencode(f) )
+        f = { 'q': search_terms, 'page': 0, 'orderby': 99 }
+        # https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
+        url = "{0}://{1}/s/?{2}".format( protocol, proxy, urllib.parse.urlencode(f) )
         print(url)
         return url

     def get_results(self, search_terms):
         url = self.craft_url( "https", self.config.proxy, search_terms )

-        fetch_results = self.client.get( url )
+        try:
+            fetch_results = self.client.get( url )
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         results_list = self.Parser.scrape( "results_list", fetch_results.content )

         return results_list


     def get_magnet(self, url):
-        url = "https://" + self.config.proxy + url
-        fetch_results = self.client.get(url)
+        try:
+            fetch_results = self.client.get(url)
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         magnet = self.Parser.scrape( "magnet_link", fetch_results.content )

         return magnet

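Review note: in both new try/except blocks, a failed request is printed but execution then falls through to a fetch_results that was never assigned, trading the original exception for a NameError. A hedged sketch of an early return that keeps the new logging while avoiding the crash; it assumes import requests already appears above this hunk (the diff does not show the first two lines of the file). For reference, urllib.parse.urlencode({'q': 'Raising Arizona', 'page': 0, 'orderby': 99}) yields 'q=Raising+Arizona&page=0&orderby=99', matching the commented example URLs in craft_url.

    def get_results(self, search_terms):
        url = self.craft_url("https", self.config.proxy, search_terms)
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            return []  # bail out: fetch_results is unbound on failure
        return self.Parser.scrape("results_list", fetch_results.content)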
@@ -71,26 +78,40 @@ class Scraper:
             results_buffer = list()

             for tr in resultsTable_xpath:
-                title = tr.xpath('td[2]/div[1]/a[1]/text()')
-                seeders = tr.xpath('td[3]/text()')[0]
-                leechers = tr.xpath('td[4]/text()')[0]
-                author = tr.xpath('td[2]/font/a/text()')
-                size_unprocessed = tr.xpath('td[2]/font/text()')[0]
-                url = tr.xpath('td/div[@class="detName"]/a[@class="detLink"]/@href')[0]
+                title = Scraper.Parser.scrape_helper( tr, 'td[2]/div[1]/a[1]/text()' )
+                seeders = Scraper.Parser.scrape_helper( tr, 'td[3]/text()' )
+                leechers = Scraper.Parser.scrape_helper( tr, 'td[4]/text()' )
+                url = Scraper.Parser.scrape_helper( tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href' )
+
+                size_unprocessed = Scraper.Parser.scrape_helper( tr, 'td[2]/font/text()' )

                 m = re.search('Size (.+?),', size_unprocessed)

                 if m:
                     size = m.group(1)

+                author = Scraper.Parser.scrape_helper( tr, 'td[2]/font[@class="detDesc"]/*/text()' )
+
+                print("Result: {0}".format( Result(title, seeders, leechers, size, author, url) ) )
                 results_buffer.append(
                     Result(title, seeders, leechers, size, author, url)
                 )

+            # hack
+            nav = results_buffer.pop()
+
             return results_buffer

+        @staticmethod
+        def scrape_helper( tr, xpathq ):
+            try:
+                val = tr.xpath( xpathq )[0]
+            except IndexError:
+                val = "0"
+            return val

         @staticmethod
         def magnet_link( text ):
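Review note: scrape_helper collapses a non-matching XPath into the string "0" instead of letting an IndexError abort the whole results loop, which is the bulk of the bug fixing in this hunk. A standalone sketch of that behaviour on a toy document (the markup here is illustrative, not TPB's). One caveat on the # hack above: results_buffer.pop() assumes the table produced at least one row (the trailing pagination row it discards) and will itself raise IndexError on an empty result set.

    from lxml import html

    def scrape_helper(tr, xpathq):
        try:
            val = tr.xpath(xpathq)[0]
        except IndexError:   # the XPath matched nothing
            val = "0"
        return val

    row = html.fromstring('<div><span class="title">Raising Arizona</span></div>')
    print(scrape_helper(row, 'span[@class="title"]/text()'))    # Raising Arizona
    print(scrape_helper(row, 'span[@class="missing"]/text()'))  # 0 (no crash)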