updated to reflect TPB site changes and various bug fixes
parent 3725629ada
commit 3cff05035b
@@ -69,7 +69,7 @@ class Scallywag:
         self.searcher = searcher.Scraper( self.config )

         results = self.searcher.get_results( search_terms )
-
+        self.status("{0} results found.".format( len(results) ) )
         for result in results:
             yield result

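Review note: get_results() returns a list (see the searcher hunk further down), so len() is safe here even though the enclosing method is a generator. A minimal sketch of the surrounding method as this hunk implies it; the name `search` and the exact behaviour of the `status` helper are assumptions, since neither is shown in this diff:

    # Hedged sketch -- 'search' is a hypothetical name for the enclosing
    # generator method, and self.status is assumed to display a message.
    def search(self, search_terms):
        results = self.searcher.get_results(search_terms)
        self.status("{0} results found.".format(len(results)))
        for result in results:
            yield result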
@@ -1,2 +1,2 @@
 [list_source]
-piratebay_proxy_list = thepiratebay-proxylist.se
+piratebay_proxy_list = https://piratebayproxy.info/
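Review note: the proxy-list source now carries its scheme inside the config value. A minimal sketch of reading an INI section like this one, assuming the standard library's configparser; the project's actual loader and config filename are not shown in this diff:

    import configparser

    config = configparser.ConfigParser()
    config.read("scallywag.ini")  # hypothetical filename

    # The value is now a full URL, scheme included, so callers no longer
    # need to prepend "https://" (see the get_proxies() hunk below).
    proxylist_url = config["list_source"]["piratebay_proxy_list"]
    print(proxylist_url)  # https://piratebayproxy.info/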
Binary file not shown.
Binary file not shown.
@@ -9,7 +9,7 @@ class Scraper:
         self.config = config

     def get_proxies(self):
-        fetch_results = self.client.get( "https://" + self.config.proxylist_url )
+        fetch_results = self.client.get( self.config.proxylist_url )
         proxy_list = self.Parser.scrape( "proxy_list", fetch_results.content )
         return proxy_list

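Review note: assuming self.client is a requests session or the requests module itself, a bare hostname is rejected outright, which is why the scheme had to move into the config value before the hardcoded prefix could be dropped:

    import requests

    # A bare hostname raises requests.exceptions.MissingSchema:
    #   requests.get("piratebayproxy.info")
    # The full URL now stored in the config works as-is:
    requests.get("https://piratebayproxy.info/")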
@@ -24,7 +24,9 @@ class Scraper:
         @staticmethod
         def proxy_list( text ):
             proxyTable = html.fromstring( text )
-            proxyTable_xpath = proxyTable.xpath( '//table[@class="proxies"]/tbody/tr/@data-domain' )
+            proxyTable_xpath = proxyTable.xpath('//body[@id="mainPage"]/div[@class="container"]/div[@id="content"]/table[@id="searchResult"]/tr/td[@class="site"]/a/text()')
+            for proxy in proxyTable_xpath:
+                print("Available Proxy: {0}".format( proxy ) )
             return proxyTable_xpath

 class SessionError( Exception ):
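Review note: a small sketch exercising the new XPath against a hand-made document shaped the way the expression expects; the real proxy-list page's markup may differ. Note that lxml's HTML parser, unlike browsers, does not inject tbody elements, which is why table/tr matches directly:

    from lxml import html

    sample = '''<html><body id="mainPage"><div class="container">
      <div id="content"><table id="searchResult">
        <tr><td class="site"><a>thepiratebay0.org</a></td></tr>
        <tr><td class="site"><a>thepiratebay10.org</a></td></tr>
      </table></div></div></body></html>'''

    tree = html.fromstring(sample)
    proxies = tree.xpath('//body[@id="mainPage"]/div[@class="container"]'
                         '/div[@id="content"]/table[@id="searchResult"]'
                         '/tr/td[@class="site"]/a/text()')
    print(proxies)  # ['thepiratebay0.org', 'thepiratebay10.org']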
@@ -3,6 +3,7 @@ from lxml import html
 import urllib
 import re
 import json
+import sys

 class Result:
     def __init__(self, title, seeders, leechers, size, author, url):
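Review note: import sys backs the new print( e, file=sys.stderr ) calls in the hunks below; sending diagnostics to stderr keeps them out of stdout, where this scraper already prints URLs and results:

    import sys

    print("Result: ...")                          # results go to stdout
    print("connection refused", file=sys.stderr)  # diagnostics go to stderr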
@@ -32,24 +33,30 @@ class Scraper:

     def craft_url(self, protocol, proxy, search_terms):
         # https://pirate.blue/s/?q=Raising+Arizona&category=0&page=0&orderby=99
-        f = { 'q': search_terms, 'category': 0, 'page': 0, 'orderby': 99 }
-        url = str.format( "{0}://{1}/s/?{2}", protocol, proxy, urllib.parse.urlencode(f) )
+        f = { 'q': search_terms, 'page': 0, 'orderby': 99 }
+        # https://thepiratebay0.org/s/?page=0&orderby=0&q=inuyasha
+        url = "{0}://{1}/s/?{2}".format( protocol, proxy, urllib.parse.urlencode(f) )
         print(url)
         return url

     def get_results(self, search_terms):
         url = self.craft_url( "https", self.config.proxy, search_terms )

-        fetch_results = self.client.get( url )
+        try:
+            fetch_results = self.client.get( url )
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         results_list = self.Parser.scrape( "results_list", fetch_results.content )

         return results_list


     def get_magnet(self, url):
-        url = "https://" + self.config.proxy + url
-        fetch_results = self.client.get(url)
+        try:
+            fetch_results = self.client.get(url)
+        except requests.exceptions.RequestException as e:
+            print( e, file=sys.stderr )
         magnet = self.Parser.scrape( "magnet_link", fetch_results.content )

         return magnet

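Review note: in both new try/except blocks, a failed request is printed but execution then falls through to a fetch_results that was never assigned, trading the original exception for a NameError. A hedged sketch of an early return that keeps the new logging while avoiding the crash; it assumes import requests already appears above this hunk (the diff does not show the first two lines of the file). For reference, urllib.parse.urlencode({'q': 'Raising Arizona', 'page': 0, 'orderby': 99}) yields 'q=Raising+Arizona&page=0&orderby=99', matching the commented example URLs in craft_url.

    def get_results(self, search_terms):
        url = self.craft_url("https", self.config.proxy, search_terms)
        try:
            fetch_results = self.client.get(url)
        except requests.exceptions.RequestException as e:
            print(e, file=sys.stderr)
            return []  # bail out: fetch_results is unbound on failure
        return self.Parser.scrape("results_list", fetch_results.content)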
@@ -71,26 +78,40 @@ class Scraper:
             results_buffer = list()

             for tr in resultsTable_xpath:
-                title = tr.xpath('td[2]/div[1]/a[1]/text()')
-                seeders = tr.xpath('td[3]/text()')[0]
-                leechers = tr.xpath('td[4]/text()')[0]
-                author = tr.xpath('td[2]/font/a/text()')
-                size_unprocessed = tr.xpath('td[2]/font/text()')[0]
-                url = tr.xpath('td/div[@class="detName"]/a[@class="detLink"]/@href')[0]
+                title = Scraper.Parser.scrape_helper( tr, 'td[2]/div[1]/a[1]/text()' )
+                seeders = Scraper.Parser.scrape_helper( tr, 'td[3]/text()' )
+                leechers = Scraper.Parser.scrape_helper( tr, 'td[4]/text()' )
+                url = Scraper.Parser.scrape_helper( tr, 'td/div[@class="detName"]/a[@class="detLink"]/@href' )
+
+                size_unprocessed = Scraper.Parser.scrape_helper( tr, 'td[2]/font/text()' )

                 m = re.search('Size (.+?),', size_unprocessed)

                 if m:
                     size = m.group(1)

+                author = Scraper.Parser.scrape_helper( tr, 'td[2]/font[@class="detDesc"]/*/text()' )
+
+                print("Result: {0}".format( Result(title, seeders, leechers, size, author, url) ) )
                 results_buffer.append(
                     Result(title, seeders, leechers, size, author, url)
                 )

+            # hack
+            nav = results_buffer.pop()
+
             return results_buffer

+        @staticmethod
+        def scrape_helper( tr, xpathq ):
+            try:
+                val = tr.xpath( xpathq )[0]
+            except IndexError:
+                val = "0"
+            return val

         @staticmethod
         def magnet_link( text ):
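Review note: scrape_helper collapses a non-matching XPath into the string "0" instead of letting an IndexError abort the whole results loop, which is the bulk of the bug fixing in this hunk. A standalone sketch of that behaviour on a toy document (the markup here is illustrative, not TPB's). One caveat on the # hack above: results_buffer.pop() assumes the table produced at least one row (the trailing pagination row it discards) and will itself raise IndexError on an empty result set.

    from lxml import html

    def scrape_helper(tr, xpathq):
        try:
            val = tr.xpath(xpathq)[0]
        except IndexError:   # the XPath matched nothing
            val = "0"
        return val

    row = html.fromstring('<div><span class="title">Raising Arizona</span></div>')
    print(scrape_helper(row, 'span[@class="title"]/text()'))    # Raising Arizona
    print(scrape_helper(row, 'span[@class="missing"]/text()'))  # 0 (no crash)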