updated to reflect TPB site changes and various bug fixes

master
Master 2020-08-02 23:37:05 -04:00
parent 3725629ada
commit 3cff05035b
6 changed files with 39 additions and 16 deletions

View File

@@ -69,7 +69,7 @@ class Scallywag:
        self.searcher = searcher.Scraper( self.config )
        results = self.searcher.get_results( search_terms )
        self.status("{0} results found.".format( len(results) ) )
        for result in results:
            yield result
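
Note: the hunk above ends in a yield, so this Scallywag method is a generator and callers consume results lazily. A minimal usage sketch (the method name "search" and the no-argument constructor are assumptions for illustration, not the project's actual API):

# Hypothetical caller; method name and constructor signature are assumed.
app = Scallywag()
for result in app.search("Raising Arizona"):
    print(result)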

View File

@@ -1,2 +1,2 @@
[list_source]
-piratebay_proxy_list = thepiratebay-proxylist.se
+piratebay_proxy_list = https://piratebayproxy.info/
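
Note: the new value carries its own scheme, which is why the hard-coded "https://" prefix is dropped from get_proxies in the next file. A minimal sketch of reading this section with Python's configparser (the filename and loader shown are assumptions, not the project's actual config code):

import configparser

config = configparser.ConfigParser()
config.read("scallywag.cfg")  # assumed filename
proxylist_url = config["list_source"]["piratebay_proxy_list"]
print(proxylist_url)  # https://piratebayproxy.info/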

View File

@@ -9,7 +9,7 @@ class Scraper:
        self.config = config
    def get_proxies(self):
-        fetch_results = self.client.get( "https://" + self.config.proxylist_url )
+        fetch_results = self.client.get( self.config.proxylist_url )
        proxy_list = self.Parser.scrape( "proxy_list", fetch_results.content )
        return proxy_list
@@ -24,7 +24,9 @@ class Scraper:
        @staticmethod
        def proxy_list( text ):
            proxyTable = html.fromstring( text )
-            proxyTable_xpath = proxyTable.xpath( '//table[@class="proxies"]/tbody/tr/@data-domain' )
+            proxyTable_xpath = proxyTable.xpath('//body[@id="mainPage"]/div[@class="container"]/div[@id="content"]/table[@id="searchResult"]/tr/td[@class="site"]/a/text()')
+            for proxy in proxyTable_xpath:
+                print("Available Proxy: {0}".format( proxy ) )
            return proxyTable_xpath
class SessionError( Exception ):
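
Note: a self-contained sketch of what the new selector pulls from the proxy-list page; the sample HTML below is fabricated to mirror the structure the XPath expects, not a capture of the live site:

from lxml import html

# Fabricated markup mirroring piratebayproxy.info's assumed layout.
sample = """
<html><body id="mainPage">
  <div class="container"><div id="content">
    <table id="searchResult">
      <tr><td class="site"><a href="#">thepiratebay0.org</a></td></tr>
      <tr><td class="site"><a href="#">tpb.party</a></td></tr>
    </table>
  </div></div>
</body></html>
"""
tree = html.fromstring(sample)
proxies = tree.xpath('//body[@id="mainPage"]/div[@class="container"]'
                     '/div[@id="content"]/table[@id="searchResult"]'
                     '/tr/td[@class="site"]/a/text()')
print(proxies)  # ['thepiratebay0.org', 'tpb.party']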

View File

@@ -3,6 +3,7 @@ from lxml import html
import urllib
import re
import json
+import sys
class Result:
    def __init__(self, title, seeders, leechers, size, author, url):
@@ -32,24 +33,30 @@ class Scraper:
    def craft_url(self, protocol, proxy, search_terms):
        # https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
-        f = { 'q': search_terms, 'category': 0, 'page': 0, 'orderby': 99 }
-        url = str.format( "{0}://{1}/s/?{2}", protocol, proxy, urllib.parse.urlencode(f) )
+        f = { 'q': search_terms, 'page': 0, 'orderby': 99 }
+        # https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
+        url = "{0}://{1}/s/?{2}".format( protocol, proxy, urllib.parse.urlencode(f) )
+        print(url)
        return url
    def get_results(self, search_terms):
        url = self.craft_url( "https", self.config.proxy, search_terms )
-        fetch_results = self.client.get( url )
+        try:
+            fetch_results = self.client.get( url )
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
        results_list = self.Parser.scrape( "results_list", fetch_results.content )
        return results_list
    def get_magnet(self, url):
        url = "https://" + self.config.proxy + url
-        fetch_results = self.client.get(url)
+        try:
+            fetch_results = self.client.get(url)
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
        magnet = self.Parser.scrape( "magnet_link", fetch_results.content )
        return magnet
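
Note: in both try blocks above, a failed request prints the error but execution still falls through to fetch_results.content, which would raise NameError since fetch_results was never bound. A hedged sketch of a tighter pattern (safe_get is a hypothetical helper, not part of this commit):

import sys
import requests

def safe_get(client, url):
    # Return the response, or None on failure, so callers can bail out
    # instead of touching an unbound variable.
    try:
        return client.get(url)
    except requests.exceptions.RequestException as e:
        print(e, file=sys.stderr)
        return None

# Hypothetical use inside get_results:
#   fetch_results = safe_get(self.client, url)
#   if fetch_results is None:
#       return []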
@@ -71,26 +78,40 @@ class Scraper:
            results_buffer = list()
            for tr in resultsTable_xpath:
-                title = tr.xpath('td[2]/div[1]/a[1]/text()')
-                seeders = tr.xpath('td[3]/text()')[0]
-                leechers = tr.xpath('td[4]/text()')[0]
-                author = tr.xpath('td[2]/font/a/text()')
-                size_unprocessed = tr.xpath('td[2]/font/text()')[0]
-                url = tr.xpath('td/div[@class="detName"]/a[@class="detLink"]/@href')[0]
+                title = Scraper.Parser.scrape_helper( tr, 'td[2]/div[1]/a[1]/text()' )
+                seeders = Scraper.Parser.scrape_helper( tr, 'td[3]/text()' )
+                leechers = Scraper.Parser.scrape_helper( tr, 'td[4]/text()' )
+                url = Scraper.Parser.scrape_helper( tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href' )
+                size_unprocessed = Scraper.Parser.scrape_helper( tr, 'td[2]/font/text()' )
                m = re.search('Size (.+?),', size_unprocessed)
                if m:
                    size = m.group(1)
+                author = Scraper.Parser.scrape_helper( tr, 'td[2]/font[@class="detDesc"]/*/text()' )
+                print("Result: {0}".format( Result(title, seeders, leechers, size, author, url) ) )
                results_buffer.append(
                    Result(title, seeders, leechers, size, author, url)
                )
+            # hack: the last row of the results table is site navigation, not a result
+            nav = results_buffer.pop()
            return results_buffer
+        @staticmethod
+        def scrape_helper( tr, xpathq ):
+            try:
+                val = tr.xpath( xpathq )[0]
+            except IndexError:
+                val = "0"
+            return val
        @staticmethod
        def magnet_link( text ):
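
Note: scrape_helper centralizes the "first XPath match or a default" pattern that the old per-field [0] indexing got wrong whenever TPB omits a cell. A standalone sketch of the same idea (names and sample markup are illustrative only):

from lxml import html

def first_or_default(el, xpathq, default="0"):
    # Mirror of scrape_helper: first match, or a default when absent.
    try:
        return el.xpath(xpathq)[0]
    except IndexError:
        return default

table = html.fromstring('<table><tr><td>42</td></tr></table>')
row = table.xpath('//tr')[0]
print(first_or_default(row, 'td/text()'))     # '42'
print(first_or_default(row, 'td[9]/text()'))  # '0' (missing cell)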