Files
headphones/headphones/searcher.py

513 lines
21 KiB
Python

import urllib, urllib2
import lib.feedparser as feedparser
from xml.dom import minidom
from xml.parsers.expat import ExpatError
import os, re, time
import headphones, exceptions
from headphones import logger, db, helpers, classes, sab
class NewzbinDownloader(urllib.FancyURLopener):
    """URL opener that interprets Newzbin's DNZB error headers.

    HTTP 400 responses are inspected for the ``X-DNZB-RCode`` header:
    code 450 (throttled) sleeps for the advertised time and then raises
    ``exceptions.NewzbinAPIThrottled``; codes 401 and 402 are only logged.
    """

    def __init__(self):
        urllib.FancyURLopener.__init__(self)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Handle an HTTP error response from Newzbin.

        Only ``errcode == 400`` is acted upon; every other status falls
        through and the method returns None.

        Raises:
            exceptions.NewzbinAPIThrottled: after sleeping for the number of
                seconds announced in the ``X-DNZB-RText`` header.
        """
        if errcode != 400:
            return

        rcode = headers.getheader('X-DNZB-RCode')
        try:
            newzbinErrCode = int(rcode)
        except (TypeError, ValueError):
            # Header missing or not numeric: nothing to interpret, and the
            # conversion error must not escape into the caller.
            logger.info("Newzbin returned HTTP 400 without a usable X-DNZB-RCode header: %s" % rcode)
            return

        # if newzbin is throttling us, wait seconds and try again
        if newzbinErrCode == 450:
            rtext = str(headers.getheader('X-DNZB-RText'))
            result = re.search(r"wait (\d+) seconds", rtext)
            if result is None:
                # No wait time advertised, so we cannot pause sensibly.
                logger.info("Newzbin throttled our NZB downloading but gave no wait time: " + rtext)
                return

            logger.info("Newzbin throttled our NZB downloading, pausing for " + result.group(1) + " seconds")
            time.sleep(int(result.group(1)))
            raise exceptions.NewzbinAPIThrottled()

        elif newzbinErrCode == 401:
            logger.info("Newzbin error 401")
            #raise exceptions.AuthException("Newzbin username or password incorrect")

        elif newzbinErrCode == 402:
            #raise exceptions.AuthException("Newzbin account not premium status, can't download NZBs")
            logger.info("Newzbin error 402")
#this should be in a class somewhere
def getNewzbinURL(url):
    """Fetch ``url`` with the configured Newzbin credentials.

    Args:
        url: address to open through ``classes.AuthURLOpener``.

    Returns:
        The response body as returned by ``read()``, or None when opening
        the URL raised ``urllib.ContentTooShortError`` or ``IOError``.
    """
    myOpener = classes.AuthURLOpener(headphones.NEWZBIN_UID, headphones.NEWZBIN_PASSWORD)
    try:
        f = myOpener.openit(url)
    except (urllib.ContentTooShortError, IOError) as e:
        # Report the actual failure; not every IOError is a short read.
        logger.info("Error loading search results: %s" % e)
        return None

    # Close the handle even if reading the body fails.
    try:
        return f.read()
    finally:
        f.close()
def searchNZB(albumid=None, new=False):
    """Search the enabled NZB providers for wanted albums and snatch the best hit.

    For every wanted album a search term is built, each enabled provider
    (NZBMatrix, Newznab, NZBs.org, Newzbin) is queried, results are filtered
    with ``verifyresult``, ranked, fetched through ``preprocess`` and finally
    handed to SABnzbd or written to the blackhole directory. Successful
    snatches are recorded in the ``albums`` and ``snatched`` tables.

    Args:
        albumid: restrict the search to this AlbumID; when falsy, all albums
            with Status "Wanted" are searched and ``new`` is forced to True.
        new: when true, results whose URL is already in the ``snatched``
            table are skipped.

    Returns:
        None.
    """
    myDB = db.DBConnection()

    if albumid:
        results = myDB.select('SELECT ArtistName, AlbumTitle, AlbumID, ReleaseDate from albums WHERE Status="Wanted" AND AlbumID=?', [albumid])
    else:
        results = myDB.select('SELECT ArtistName, AlbumTitle, AlbumID, ReleaseDate from albums WHERE Status="Wanted"')
        new = True

    for albums in results:

        albumid = albums[2]
        reldate = albums[3]

        try:
            year = reldate[:4]
        except TypeError:
            # ReleaseDate is not sliceable (e.g. None): search without a year.
            year = ''

        dic = {'...':'', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':''}

        cleanalbum = helpers.latinToAscii(helpers.replace_all(albums[1], dic))
        cleanartist = helpers.latinToAscii(helpers.replace_all(albums[0], dic))

        # FLAC usually doesn't have a year for some reason so I'll leave it out
        # Various Artist albums might be listed as VA, so I'll leave that out too
        # Only use the year if the term could return a bunch of different albums, i.e. self-titled albums
        if albums[0] in albums[1] or len(albums[0]) < 4 or len(albums[1]) < 4:
            term = cleanartist + ' ' + cleanalbum + ' ' + year
        elif albums[0] == 'Various Artists':
            term = cleanalbum + ' ' + year
        else:
            term = cleanartist + ' ' + cleanalbum

        # Replace bad characters in the term and unicode it
        term = re.sub(r'[\.\-\/]', ' ', term).encode('utf-8')
        artistterm = re.sub(r'[\.\-\/]', ' ', cleanartist).encode('utf-8')

        logger.info("Searching for %s since it was marked as wanted" % term)

        # Each entry is a (title, size, url, provider) tuple.
        resultlist = []

        if headphones.NZBMATRIX:
            provider = "nzbmatrix"
            if headphones.PREFERRED_QUALITY == 3:
                categories = "23"
                maxsize = 10000000000
            elif headphones.PREFERRED_QUALITY:
                categories = "23,22"
                maxsize = 2000000000
            else:
                categories = "22"
                maxsize = 300000000

            params = { "page": "download",
                       "username": headphones.NZBMATRIX_USERNAME,
                       "apikey": headphones.NZBMATRIX_APIKEY,
                       "subcat": categories,
                       "age": headphones.USENET_RETENTION,
                       "english": 1,
                       "ssl": 1,
                       "scenename": 1,
                       "term": term
                       }

            searchURL = "http://rss.nzbmatrix.com/rss.php?" + urllib.urlencode(params)
            logger.info(u'Parsing results from <a href="%s">NZBMatrix</a>' % searchURL)
            try:
                data = urllib2.urlopen(searchURL, timeout=20).read()
            except urllib2.URLError as e:
                logger.warn('Error fetching data from NZBMatrix: %s' % e)
                data = False

            if data:
                d = feedparser.parse(data)

                for item in d.entries:
                    try:
                        url = item.link
                        title = item.title
                        size = int(item.links[1]['length'])
                        if size < maxsize:
                            resultlist.append((title, size, url, provider))
                            logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
                        else:
                            logger.info('%s is larger than the maxsize for this category, skipping. (Size: %i bytes)' % (title, size))
                    except AttributeError:
                        logger.info(u"No results found from NZBMatrix for %s" % term)

        if headphones.NEWZNAB:
            provider = "newznab"
            if headphones.PREFERRED_QUALITY == 3:
                categories = "3040"
                maxsize = 10000000000
            elif headphones.PREFERRED_QUALITY:
                categories = "3040,3010"
                maxsize = 2000000000
            else:
                categories = "3010"
                maxsize = 300000000

            params = { "t": "search",
                       "apikey": headphones.NEWZNAB_APIKEY,
                       "cat": categories,
                       "maxage": headphones.USENET_RETENTION,
                       "q": term
                       }

            searchURL = headphones.NEWZNAB_HOST + '/api?' + urllib.urlencode(params)
            logger.info(u'Parsing results from <a href="%s">%s</a>' % (searchURL, headphones.NEWZNAB_HOST))
            try:
                data = urllib2.urlopen(searchURL, timeout=20).read()
            except urllib2.URLError as e:
                logger.warn('Error fetching data from %s: %s' % (headphones.NEWZNAB_HOST, e))
                data = False

            if data:
                d = feedparser.parse(data)

                if not len(d.entries):
                    logger.info(u"No results found from %s for %s" % (headphones.NEWZNAB_HOST, term))
                else:
                    for item in d.entries:
                        try:
                            url = item.link
                            title = item.title
                            size = int(item.links[1]['length'])
                            if size < maxsize:
                                resultlist.append((title, size, url, provider))
                                logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
                            else:
                                logger.info('%s is larger than the maxsize for this category, skipping. (Size: %i bytes)' % (title, size))
                        except Exception as e:
                            logger.error(u"An unknown error occured trying to parse the feed: %s" % e)

        if headphones.NZBSORG:
            provider = "nzbsorg"
            if headphones.PREFERRED_QUALITY == 3:
                categories = "5"
                maxsize = 10000000000
                # NOTE(review): this also changes the term used by the Newzbin
                # query and by verifyresult() below - confirm that is intended.
                term = term + ' flac'
            elif headphones.PREFERRED_QUALITY:
                categories = "5"
                maxsize = 2000000000
            else:
                categories = "5"
                maxsize = 300000000

            params = { "action": "search",
                       "dl": 1,
                       "catid": categories,
                       "i": headphones.NZBSORG_UID,
                       "h": headphones.NZBSORG_HASH,
                       "age": headphones.USENET_RETENTION,
                       "q": term
                       }

            searchURL = 'https://secure.nzbs.org/rss.php?' + urllib.urlencode(params)
            logger.info(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            try:
                data = urllib2.urlopen(searchURL, timeout=20).read()
            except urllib2.URLError as e:
                logger.warn('Error fetching data from NZBs.org: %s' % e)
                data = False

            if data:
                try:
                    d = minidom.parseString(data)
                    items = d.getElementsByTagName("item")
                except ExpatError:
                    logger.error('Unable to get the NZBs.org feed. Check that your settings are correct - post a bug if they are')
                    items = None

                if items:
                    for item in items:
                        sizenode = item.getElementsByTagName("report:size")[0].childNodes
                        titlenode = item.getElementsByTagName("title")[0].childNodes
                        linknode = item.getElementsByTagName("link")[0].childNodes

                        for node in sizenode:
                            size = int(node.data)
                        for node in titlenode:
                            title = node.data
                        for node in linknode:
                            url = node.data

                        if size < maxsize:
                            resultlist.append((title, size, url, provider))
                            logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
                        else:
                            logger.info('%s is larger than the maxsize for this category, skipping. (Size: %i bytes)' % (title, size))
                else:
                    logger.info('No results found from NZBs.org for %s' % term)

        if headphones.NEWZBIN:
            provider = "newzbin"
            providerurl = "https://www.newzbin.com/"
            if headphones.PREFERRED_QUALITY == 3:
                categories = "7"        #music
                audio_format = "2"      #flac
                maxsize = 10000000000
            elif headphones.PREFERRED_QUALITY:
                categories = "7"        #music
                audio_format = "10"     #mp3+flac
                maxsize = 2000000000
            else:
                categories = "7"        #music
                audio_format = "8"      #mp3
                maxsize = 300000000

            params = {
                "fpn": "p",
                'u_nfo_posts_only': 0,
                'u_url_posts_only': 0,
                'u_comment_posts_only': 0,
                'u_show_passworded': 0,
                "searchaction": "Search",
                #"dl": 1,
                "category": categories,
                "retention": headphones.USENET_RETENTION,
                "ps_rb_audio_format": audio_format,
                "feed": "rss",
                "u_post_results_amt": 50,  #this can default to a high number per user
                "hauth": 1,
                "q": term
            }
            searchURL = providerurl + "search/?%s" % urllib.urlencode(params)
            try:
                data = getNewzbinURL(searchURL)
            except exceptions.NewzbinAPIThrottled:
                #try again if we were throttled
                data = getNewzbinURL(searchURL)

            if data:
                logger.info(u'Parsing results from <a href="%s">%s</a>' % (searchURL, providerurl))

                try:
                    d = minidom.parseString(data)
                    items = d.getElementsByTagName("item")
                except ExpatError:
                    logger.info('Unable to get the NEWZBIN feed. Check that your settings are correct - post a bug if they are')
                    items = []

                if len(items):
                    # Loop-invariant pattern for pulling the report id out of a link.
                    id_regex = re.escape(providerurl) + r'browse/post/(\d+)/'

                    for item in items:
                        sizenode = item.getElementsByTagName("report:size")[0].childNodes
                        titlenode = item.getElementsByTagName("title")[0].childNodes
                        linknode = item.getElementsByTagName("link")[0].childNodes

                        for node in sizenode:
                            size = int(node.data)
                        for node in titlenode:
                            title = node.data
                        for node in linknode:
                            url = node.data

                        #extract the reportid from the link nodes
                        id_match = re.match(id_regex, url)
                        if not id_match:
                            # Without a report id the NZB cannot be requested
                            # later, so the item is not usable as a result.
                            logger.info("Didn't find a valid Newzbin reportid in linknode")
                            continue

                        url = id_match.group(1)  #we have to make a post request later, need the id

                        if size < maxsize and url:
                            resultlist.append((title, size, url, provider))
                            logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
                        else:
                            logger.info('%s is larger than the maxsize for this category, skipping. (Size: %i bytes)' % (title, size))
                else:
                    logger.info('No results found from NEWZBIN for %s' % term)

        #attempt to verify that this isn't a substring result
        #when looking for "Foo - Foo" we don't want "Foobar"
        #this should be less of an issue when it isn't a self-titled album so we'll only check vs artist
        if len(resultlist):
            resultlist[:] = [result for result in resultlist if verifyresult(result[0], artistterm, term)]

        if len(resultlist):

            if headphones.PREFERRED_QUALITY == 2 and headphones.PREFERRED_BITRATE:

                logger.debug('Target bitrate: %s kbps' % headphones.PREFERRED_BITRATE)

                tracks = myDB.select('SELECT TrackDuration from tracks WHERE AlbumID=?', [albumid])

                try:
                    # presumably TrackDuration is in milliseconds - TODO confirm
                    albumlength = sum([pair[0] for pair in tracks])
                    targetsize = albumlength/1000 * int(headphones.PREFERRED_BITRATE) * 128
                    logger.info('Target size: %s' % helpers.bytes_to_mb(targetsize))

                    newlist = []
                    for result in resultlist:
                        delta = abs(targetsize - result[1])
                        # Keep the provider at index 3 (getresultNZB reads it
                        # there) and append the distance to the target size.
                        newlist.append((result[0], result[1], result[2], result[3], delta))

                    nzblist = sorted(newlist, key=lambda title: title[4])

                except Exception as e:
                    logger.debug('Error: %s' % str(e))
                    logger.info('No track information for %s - %s. Defaulting to highest quality' % (albums[0], albums[1]))

                    nzblist = sorted(resultlist, key=lambda title: title[1], reverse=True)

            else:
                nzblist = sorted(resultlist, key=lambda title: title[1], reverse=True)

            if new:
                # Checks to see if it's already downloaded
                candidates = []
                for candidate in nzblist:
                    alreadydownloaded = myDB.select('SELECT * from snatched WHERE URL=?', [candidate[2]])
                    if len(alreadydownloaded) >= 1:
                        logger.info('%s has already been downloaded. Skipping.' % candidate[0])
                    else:
                        candidates.append(candidate)

                if not candidates:
                    logger.info('No more matches for %s' % term)
                    # Move on to the next wanted album instead of aborting
                    # the whole search run.
                    continue
            else:
                candidates = nzblist

            bestqual = candidates[0]

            logger.info(u'Found best result: <a href="%s">%s</a> - %s' % (bestqual[2], bestqual[0], helpers.bytes_to_mb(bestqual[1])))
            logger.info(u"Pre-processing result")

            # preprocess() walks the list in order and may settle on a later
            # entry, so only hand it results that are still eligible.
            (data, bestqual) = preprocess(candidates)

            if data and bestqual:
                nzb_folder_name = '%s - %s [%s]' % (helpers.latinToAscii(albums[0]).encode('UTF-8').replace('/', '_'), helpers.latinToAscii(albums[1]).encode('UTF-8').replace('/', '_'), year)
                if headphones.SAB_HOST and not headphones.BLACKHOLE:

                    nzb = classes.NZBDataSearchResult()
                    nzb.extraInfo.append(data)
                    nzb.name = nzb_folder_name
                    sab.sendNZB(nzb)

                elif headphones.BLACKHOLE:

                    nzb_name = nzb_folder_name + '.nzb'
                    download_path = os.path.join(headphones.BLACKHOLE_DIR, nzb_name)
                    try:
                        # The context manager closes the file even if the write fails.
                        with open(download_path, 'w') as f:
                            f.write(data)
                        logger.info('File saved to: %s' % nzb_name)
                    except Exception as e:
                        logger.error('Couldn\'t write NZB file: %s' % e)
                        break

                # NOTE(review): reached even when neither SAB_HOST nor
                # BLACKHOLE is configured - confirm that is intended.
                myDB.action('UPDATE albums SET status = "Snatched" WHERE AlbumID=?', [albums[2]])
                myDB.action('INSERT INTO snatched VALUES( ?, ?, ?, ?, DATETIME("NOW", "localtime"), ?, ?)', [albums[2], bestqual[0], bestqual[1], bestqual[2], "Snatched", nzb_folder_name])
def verifyresult(title, artistterm, term):
    """Decide whether a search result title really matches the search term.

    Args:
        title: release title returned by a provider.
        artistterm: cleaned artist name; the artist checks are skipped when
            it equals 'Various Artists'.
        term: full search term whose words must all occur in ``title``.

    Returns:
        True when the title passes every check, False otherwise (the reason
        is logged).
    """
    # Treat the usual release-name separators as plain spaces.
    title = re.sub(r'[\.\-\/\_]', ' ', title)

    if artistterm != 'Various Artists':

        if not re.search('^' + re.escape(artistterm), title, re.IGNORECASE):
            logger.info("Removed from results: " + title + " (artist not at string start).")
            return False
        elif re.search(re.escape(artistterm) + r'\w', title, re.IGNORECASE | re.UNICODE):
            logger.info("Removed from results: " + title + " (post substring result).")
            return False
        elif re.search(r'\w' + re.escape(artistterm), title, re.IGNORECASE | re.UNICODE):
            logger.info("Removed from results: " + title + " (pre substring result).")
            return False

    #another attempt to weed out substrings. We don't want "Vol III" when we were looking for "Vol II"
    # The third positional argument of re.split is maxsplit, not flags, so it
    # is omitted here: every word of the term must be checked, not just the
    # first few dozen.
    tokens = re.split(r'\W', term)
    for token in tokens:

        # Consecutive or trailing separators produce empty tokens; they carry
        # no information and must not reject the title.
        if not token:
            continue
        if token == 'Various' or token == 'Artists' or token == 'VA':
            continue
        if not re.search(r'(?:\W|^)+' + re.escape(token) + r'(?:\W|$)+', title, re.IGNORECASE | re.UNICODE):
            logger.info("Removed from results: " + title + " (missing token: " + token + ")")
            return False
    return True
def getresultNZB(result):
    """Download the NZB belonging to a search result.

    Args:
        result: result tuple; index 2 holds the URL (or the Newzbin report
            id) and index 3 the provider name.

    Returns:
        The NZB contents as returned by ``read()``, or None when the download
        failed.
    """
    nzb = None

    if result[3] == 'newzbin':
        params = urllib.urlencode({"username": headphones.NEWZBIN_UID, "password": headphones.NEWZBIN_PASSWORD, "reportid": result[2]})
        url = "https://www.newzbin.com" + "/api/dnzb/"
        # Installs the Newzbin-aware opener as urllib's module-level opener.
        urllib._urlopener = NewzbinDownloader()
        try:
            nzb = urllib.urlopen(url, data=params).read()
        except urllib2.URLError as e:
            logger.warn('Error fetching nzb from url: %s. Error: %s' % (url, e))
        except exceptions.NewzbinAPIThrottled:
            #TODO: This has created a potentially infinite loop? As long as they keep throttling we keep trying.
            logger.info("Done waiting for Newzbin API throttle limit, starting downloads again")
            # Keep the outcome of the retry; otherwise a successful second
            # attempt would still be reported as a failure.
            nzb = getresultNZB(result)
        except AttributeError:
            logger.warn("AttributeError in getresultNZB.")
    else:
        try:
            nzb = urllib2.urlopen(result[2], timeout=30).read()
        except urllib2.URLError as e:
            logger.warn('Error fetching nzb from url: ' + result[2] + ' %s' % e)

    return nzb
def preprocess(resultlist):
    """Return the first result whose NZB can be fetched, parsed and is in retention.

    Args:
        resultlist: result tuples in order of preference; each is passed to
            ``getresultNZB``.

    Returns:
        ``(nzb, result)`` for the first acceptable result, or
        ``(False, False)`` when none qualifies.
    """
    if not headphones.USENET_RETENTION:
        usenet_retention = 2000
    else:
        usenet_retention = int(headphones.USENET_RETENTION)

    # Oldest acceptable posting time, in seconds since the epoch.
    oldest_allowed = time.time() - usenet_retention * 86400

    for result in resultlist:
        nzb = getresultNZB(result)
        if not nzb:
            logger.error("Couldn't retrieve the best nzb. Skipping.")
            continue

        try:
            d = minidom.parseString(nzb)
        except ExpatError:
            logger.error('Unable to parse the best result NZB. Skipping.')
            continue

        out_of_retention = False
        for nzbfile in d.getElementsByTagName("file"):
            # The attribute is a string; compare it numerically.
            # presumably a Unix timestamp per the NZB format - TODO confirm
            try:
                posted = int(nzbfile.getAttribute("date"))
            except ValueError:
                # Missing or malformed date: cannot judge this file.
                continue
            if posted < oldest_allowed:
                out_of_retention = True
                break
            #TODO: Do we want rar checking in here to try to keep unknowns out?
            #or at least the option to do so?

        if out_of_retention:
            logger.error('NZB contains a file out of your retention. Skipping.')
            continue

        return nzb, result

    return (False, False)