Scallywag/rsrc/Scraper/searcher.py

107 lines
3.2 KiB
Python
Raw Normal View History

2018-10-08 03:08:35 +00:00
import json
import re
import urllib
import urllib.parse

import requests
from lxml import html
class Result:
    """A single search hit; ``str(result)`` yields its JSON representation."""

    # Attribute names in the order they are written to the JSON object.
    _FIELDS = ('title', 'seeders', 'leechers', 'size', 'author', 'url')

    def __init__(self, title, seeders, leechers, size, author, url):
        """Store the fields, coercing the counts to ``int`` and the rest to ``str``.

        Raises:
            ValueError: if ``seeders`` or ``leechers`` cannot be parsed by ``int()``.
        """
        (
            self.title,
            self.seeders,
            self.leechers,
            self.size,
            self.author,
            self.url,
        ) = (
            str(title),
            int(seeders),
            int(leechers),
            str(size),
            str(author),
            str(url),
        )

    def __str__(self):
        """Return the result serialized as a JSON object string."""
        payload = {name: getattr(self, name) for name in self._FIELDS}
        return json.dumps(payload)
class Scraper:
    """Query a torrent-index proxy and extract search results and magnet links.

    ``config`` must expose a ``proxy`` attribute; it is used as the host part
    of every URL this class requests (``https://<proxy>/...``).
    """

    def __init__(self, config):
        # the request client, belongs to session even if no "user session" is needed
        self.client = requests.Session()
        self.config = config

    def craft_url(self, protocol, proxy, search_terms):
        """Build the search URL for ``search_terms`` and return it.

        The URL is also printed to stdout, as in the original implementation.
        Example output:
        https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
        """
        query = urllib.parse.urlencode(
            {'q': search_terms, 'category': 0, 'page': 0, 'orderby': 99}
        )
        url = "{0}://{1}/s/?{2}".format(protocol, proxy, query)
        print(url)
        return url

    def get_results(self, search_terms):
        """Fetch the search page for ``search_terms``; return a list of ``Result``."""
        url = self.craft_url("https", self.config.proxy, search_terms)
        response = self.client.get(url)
        return self.Parser.scrape("results_list", response.content)

    def get_magnet(self, url):
        """Fetch a detail page and return its magnet link.

        ``url`` is appended verbatim to ``https://<proxy>``, so it must be a
        site-relative path starting with ``/``.

        Raises:
            IndexError: if the page contains no download link.
        """
        url = "https://" + self.config.proxy + url
        response = self.client.get(url)
        return self.Parser.scrape("magnet_link", response.content)

    class Parser:
        """Stateless HTML extractors, selected by name through ``scrape``."""

        # Captures the text between "Size " and the next comma of a row's
        # description, e.g. "1.4 GiB" from "..., Size 1.4 GiB, ...".
        _SIZE_PATTERN = re.compile(r'Size (.+?),')

        @staticmethod
        def scrape(datapoint, text):
            """Run the extractor registered under ``datapoint`` on ``text``.

            Raises:
                KeyError: if ``datapoint`` is not "results_list" or "magnet_link".
            """
            cases = {
                "results_list": Scraper.Parser.results_list,
                "magnet_link": Scraper.Parser.magnet_link,
            }
            return cases[datapoint](text)

        @staticmethod
        def results_list(text):
            """Parse the ``searchResult`` table into a list of ``Result`` objects.

            Rows that lack the seeders, leechers, description or detail-link
            cells are skipped instead of aborting the whole parse with an
            IndexError. A missing title or author, or a description without a
            "Size ...," part, yields an empty string for that field.
            """
            document = html.fromstring(text)
            results_buffer = []
            for tr in document.xpath('//table[@id="searchResult"]/tr'):
                seeders = tr.xpath('td[3]/text()')
                leechers = tr.xpath('td[4]/text()')
                description = tr.xpath('td[2]/font/text()')
                url = tr.xpath(
                    'td/div[@class="detName"]/a[@class="detLink"]/@href'
                )
                if not (seeders and leechers and description and url):
                    # Not a result row; nothing usable to report.
                    continue

                # xpath() returns lists; take the first text node so Result
                # stores the plain text rather than the repr of a list.
                title = tr.xpath('td[2]/div[1]/a[1]/text()')
                author = tr.xpath('td[2]/font/a/text()')

                # Compute the size per row so a non-matching description can
                # neither raise UnboundLocalError nor reuse the previous size.
                size_match = Scraper.Parser._SIZE_PATTERN.search(description[0])

                results_buffer.append(
                    Result(
                        title[0] if title else "",
                        seeders[0],
                        leechers[0],
                        size_match.group(1) if size_match else "",
                        author[0] if author else "",
                        url[0],
                    )
                )
            return results_buffer

        @staticmethod
        def magnet_link(text):
            """Return the href of the first link inside ``div.download``.

            Raises:
                IndexError: if no such link exists in ``text``.
            """
            link_page = html.fromstring(text)
            return link_page.xpath('//div[@class="download"]/a/@href')[0]
class SessionError(Exception):
    """Error carrying an arbitrary payload in its ``value`` attribute."""

    def __init__(self, value):
        # Chain to Exception so args/str() reflect the payload even when the
        # error is constructed with ``value=`` as a keyword argument.
        super().__init__(value)
        self.value = value