Scallywag/rsrc/Scraper/searcher.py

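"""Search a Pirate Bay mirror and scrape result listings and magnet links."""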
import requests
from lxml import html
import urllib.parse
import re
import json
import sys
class Result:
    def __init__(self, title, seeders, leechers, size, author, url):
        self.title = str(title)
        self.seeders = int(seeders)
        self.leechers = int(leechers)
        self.size = str(size)
        self.author = str(author)
        self.url = str(url)

    def __str__(self):
        return json.dumps({
            'title': self.title,
            'seeders': self.seeders,
            'leechers': self.leechers,
            'size': self.size,
            'author': self.author,
            'url': self.url,
        })
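# For illustration (hypothetical values): printing a Result yields a flat JSON
# object such as
#   {"title": "...", "seeders": 12, "leechers": 3, "size": "701.4 MiB",
#    "author": "...", "url": "/torrent/..."}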
class Scraper:
    def __init__(self, config):
        # The request client; a Session is kept even though no "user session"
        # is needed, so the connection can be reused across requests.
        self.client = requests.Session()
        self.config = config

    def craft_url(self, protocol, proxy, search_terms):
        # e.g. https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
        # e.g. https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
        f = {'q': search_terms, 'page': 0, 'orderby': 99}
        url = "{0}://{1}/s/?{2}".format(protocol, proxy, urllib.parse.urlencode(f))
        print(url)
        return url
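    # For example, craft_url("https", "thepiratebay0.org", "Raising Arizona")
    # returns "https://thepiratebay0.org/s/?q=Raising+Arizona&page=0&orderby=99":
    # urlencode escapes the space as "+" and preserves the dict's key order.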
    def get_results(self, search_terms):
        url = self.craft_url("https", self.config.proxy, search_terms)
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            # Without a response there is nothing to parse.
            return []

        results_list = self.Parser.scrape("results_list", fetch_results.content)
        return results_list
    def get_magnet(self, url):
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            return None

        magnet = self.Parser.scrape("magnet_link", fetch_results.content)
        return magnet
    class Parser:
        @staticmethod
        def scrape(datapoint, text):
            # Dispatch table mapping a datapoint name to the parser for it.
            cases = {
                "results_list": Scraper.Parser.results_list,
                "magnet_link": Scraper.Parser.magnet_link,
            }
            return cases[datapoint](text)
        @staticmethod
        def results_list(text):
            results_table = html.fromstring(text)
            rows = results_table.xpath('//table[@id="searchResult"]/tr')
            results_buffer = list()
            for tr in rows:
                title = Scraper.Parser.scrape_helper(tr, 'td[2]/div[1]/a[1]/text()')
                seeders = Scraper.Parser.scrape_helper(tr, 'td[3]/text()')
                leechers = Scraper.Parser.scrape_helper(tr, 'td[4]/text()')
                url = Scraper.Parser.scrape_helper(tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href')

                # The size is embedded in a description string along the lines
                # of "Uploaded 03-08 2013, Size 701.4 MiB, ULed by ...".
                size_unprocessed = Scraper.Parser.scrape_helper(tr, 'td[2]/font/text()')
                m = re.search('Size (.+?),', size_unprocessed)
                size = m.group(1) if m else "0"

                author = Scraper.Parser.scrape_helper(tr, 'td[2]/font[@class="detDesc"]/*/text()')

                result = Result(title, seeders, leechers, size, author, url)
                print("Result: {0}".format(result))
                results_buffer.append(result)

            # Hack: the table's last row is pagination, not a result, so drop it.
            if results_buffer:
                results_buffer.pop()

            return results_buffer
        @staticmethod
        def scrape_helper(tr, xpathq):
            # Return the first xpath match, or "0" when nothing matches.
            try:
                val = tr.xpath(xpathq)[0]
            except IndexError:
                val = "0"
            return val
        @staticmethod
        def magnet_link(text):
            link_page = html.fromstring(text)
            # First link inside the "download" div; raises IndexError if the
            # page has no download section.
            magnet_link = link_page.xpath('//div[@class="download"]/a/@href')[0]
            return magnet_link
class SessionError(Exception):
    def __init__(self, value):
        super().__init__(value)
        self.value = value
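
# A minimal usage sketch, not part of the original module. It assumes a config
# object exposing a "proxy" attribute naming the mirror host; the host and
# search term below are illustrative values taken from the comments above.
if __name__ == "__main__":
    class _DemoConfig:
        proxy = "thepiratebay0.org"  # hypothetical mirror host

    scraper = Scraper(_DemoConfig())
    results = scraper.get_results("inuyasha")
    for result in results:
        print(result)  # Result.__str__ emits one JSON object per hit

    if results:
        # Assuming the detail-page href is site-relative, as on these mirrors,
        # follow the first hit's detail page to pull its magnet URI.
        detail_url = "https://{0}{1}".format(_DemoConfig.proxy, results[0].url)
        print(scraper.get_magnet(detail_url))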