Scallywag/rsrc/Scraper/searcher.py

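"""Search a Pirate Bay mirror and scrape result listings and magnet links."""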
import requests
from lxml import html
import urllib.parse
import re
import json
import sys
class Result:
    def __init__(self, title, seeders, leechers, size, author, url):
        self.title = str(title)
        self.seeders = int(seeders)
        self.leechers = int(leechers)
        self.size = str(size)
        self.author = str(author)
        self.url = str(url)

    def __str__(self):
        return json.dumps({
            'title': self.title,
            'seeders': self.seeders,
            'leechers': self.leechers,
            'size': self.size,
            'author': self.author,
            'url': self.url,
        })
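# For illustration (hypothetical values): printing a Result yields a flat JSON
# object such as
#   {"title": "...", "seeders": 12, "leechers": 3, "size": "701.4 MiB",
#    "author": "...", "url": "/torrent/..."}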
class Scraper:
    def __init__(self, config):
        # The request client; a Session is kept even though no "user session"
        # is needed, so the connection can be reused across requests.
        self.client = requests.Session()
        self.config = config

    def craft_url(self, protocol, proxy, search_terms):
        # e.g. https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
        # e.g. https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
        f = {'q': search_terms, 'page': 0, 'orderby': 99}
        url = "{0}://{1}/s/?{2}".format(protocol, proxy, urllib.parse.urlencode(f))
        print(url)
        return url
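    # For example, craft_url("https", "thepiratebay0.org", "Raising Arizona")
    # returns "https://thepiratebay0.org/s/?q=Raising+Arizona&page=0&orderby=99":
    # urlencode escapes the space as "+" and preserves the dict's key order.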
    def get_results(self, search_terms):
        url = self.craft_url("https", self.config.proxy, search_terms)
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            # Without a response there is nothing to parse.
            return []

        results_list = self.Parser.scrape("results_list", fetch_results.content)
        return results_list
    def get_magnet(self, url):
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            return None

        magnet = self.Parser.scrape("magnet_link", fetch_results.content)
        return magnet
    class Parser:
        @staticmethod
        def scrape(datapoint, text):
            # Dispatch table mapping a datapoint name to the parser for it.
            cases = {
                "results_list": Scraper.Parser.results_list,
                "magnet_link": Scraper.Parser.magnet_link,
            }
            return cases[datapoint](text)
        @staticmethod
        def results_list(text):
            results_table = html.fromstring(text)
            rows = results_table.xpath('//table[@id="searchResult"]/tr')
            results_buffer = list()
            for tr in rows:
                title = Scraper.Parser.scrape_helper(tr, 'td[2]/div[1]/a[1]/text()')
                seeders = Scraper.Parser.scrape_helper(tr, 'td[3]/text()')
                leechers = Scraper.Parser.scrape_helper(tr, 'td[4]/text()')
                url = Scraper.Parser.scrape_helper(tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href')

                # The size is embedded in a description string along the lines
                # of "Uploaded 03-08 2013, Size 701.4 MiB, ULed by ...".
                size_unprocessed = Scraper.Parser.scrape_helper(tr, 'td[2]/font/text()')
                m = re.search('Size (.+?),', size_unprocessed)
                size = m.group(1) if m else "0"

                author = Scraper.Parser.scrape_helper(tr, 'td[2]/font[@class="detDesc"]/*/text()')

                result = Result(title, seeders, leechers, size, author, url)
                print("Result: {0}".format(result))
                results_buffer.append(result)

            # Hack: the table's last row is pagination, not a result, so drop it.
            if results_buffer:
                results_buffer.pop()

            return results_buffer
        @staticmethod
        def scrape_helper(tr, xpathq):
            # Return the first xpath match, or "0" when nothing matches.
            try:
                val = tr.xpath(xpathq)[0]
            except IndexError:
                val = "0"
            return val
        @staticmethod
        def magnet_link(text):
            link_page = html.fromstring(text)
            # First link inside the "download" div; raises IndexError if the
            # page has no download section.
            magnet_link = link_page.xpath('//div[@class="download"]/a/@href')[0]
            return magnet_link
class SessionError(Exception):
    def __init__(self, value):
        super().__init__(value)
        self.value = value
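
# A minimal usage sketch, not part of the original module. It assumes a config
# object exposing a "proxy" attribute naming the mirror host; the host and
# search term below are illustrative values taken from the comments above.
if __name__ == "__main__":
    class _DemoConfig:
        proxy = "thepiratebay0.org"  # hypothetical mirror host

    scraper = Scraper(_DemoConfig())
    results = scraper.get_results("inuyasha")
    for result in results:
        print(result)  # Result.__str__ emits one JSON object per hit

    if results:
        # Assuming the detail-page href is site-relative, as on these mirrors,
        # follow the first hit's detail page to pull its magnet URI.
        detail_url = "https://{0}{1}".format(_DemoConfig.proxy, results[0].url)
        print(scraper.get_magnet(detail_url))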