From 9811df2779231cb527da5951c0a68ca31ebbc8ab Mon Sep 17 00:00:00 2001 From: rembo10 Date: Thu, 18 Jan 2024 16:01:26 +0530 Subject: [PATCH] Slightly clean up searcher.py, fix regex warnings --- headphones/helpers.py | 7 +++++ headphones/helpers_test.py | 17 ++++++++++- headphones/searcher.py | 61 ++++++++++++++++++++++++-------------- 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/headphones/helpers.py b/headphones/helpers.py index c8f53c64..3a79d345 100644 --- a/headphones/helpers.py +++ b/headphones/helpers.py @@ -1050,3 +1050,10 @@ def have_pct_have_total(db_artist): have_pct = have_tracks / total_tracks if total_tracks else 0 return (have_pct, total_tracks) + +def has_token(title, token): + return bool( + re.search(rf'(?:\W|^)+{token}(?:\W|$)+', + title, + re.IGNORECASE | re.UNICODE) + ) diff --git a/headphones/helpers_test.py b/headphones/helpers_test.py index 16753f91..09a1783c 100644 --- a/headphones/helpers_test.py +++ b/headphones/helpers_test.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from .unittestcompat import TestCase -from headphones.helpers import clean_name, is_valid_date, age +from headphones.helpers import clean_name, is_valid_date, age, has_token class HelpersTest(TestCase): @@ -56,3 +56,18 @@ class HelpersTest(TestCase): ] for input, expected, desc in test_cases: self.assertEqual(is_valid_date(input), expected, desc) + + def test_has_token(self): + """helpers: has_token()""" + self.assertEqual( + has_token("a cat ran", "cat"), + True, + "return True if token is in string" + ) + self.assertEqual( + has_token("acatran", "cat"), + False, + "return False if token is part of another word" + ) + + diff --git a/headphones/searcher.py b/headphones/searcher.py index 93ec607a..869c1084 100644 --- a/headphones/searcher.py +++ b/headphones/searcher.py @@ -37,9 +37,27 @@ from unidecode import unidecode import headphones from headphones.common import USER_AGENT +from headphones.helpers import ( + bytes_to_mb, + has_token, + piratesize, + replace_all, + replace_illegal_chars, + sab_replace_dots, + sab_replace_spaces, + sab_sanitize_foldername, + ) from headphones.types import Result -from headphones import logger, db, helpers, classes, sab, nzbget, request -from headphones import utorrent, transmission, notifiers, rutracker, deluge, qbittorrent, bandcamp +from headphones import logger, db, classes, sab, nzbget, request +from headphones import ( + bandcamp, + deluge, + notifiers, + qbittorrent, + rutracker, + transmission, + utorrent, + ) from bencode import bencode, bdecode # Magnet to torrent services, for Black hole. Stolen from CouchPotato. @@ -137,7 +155,7 @@ def calculate_torrent_hash(link, data=None): """ if link.startswith("magnet:"): - torrent_hash = re.findall("urn:btih:([\w]{32,40})", link)[0] + torrent_hash = re.findall(r"urn:btih:([\w]{32,40})", link)[0] if len(torrent_hash) == 32: torrent_hash = b16encode(b32decode(torrent_hash)).lower() elif data: @@ -553,8 +571,8 @@ def searchNZB(album, new=False, losslessOnly=False, albumlength=None, term = cleanartist + ' ' + cleanalbum # Replace bad characters in the term - term = re.sub('[\.\-\/]', ' ', term) - artistterm = re.sub('[\.\-\/]', ' ', cleanartist) + term = re.sub(r'[\.\-\/]', r' ', term) + artistterm = re.sub(r'[\.\-\/]', r' ', cleanartist) # If Preferred Bitrate and High Limit and Allow Lossless then get both lossy and lossless if headphones.CONFIG.PREFERRED_QUALITY == 2 and headphones.CONFIG.PREFERRED_BITRATE and headphones.CONFIG.PREFERRED_BITRATE_HIGH_BUFFER and headphones.CONFIG.PREFERRED_BITRATE_ALLOW_LOSSLESS: @@ -1172,7 +1190,7 @@ def send_to_downloader(data, result, album): def verifyresult(title, artistterm, term, lossless): - title = re.sub('[\.\-\/\_]', ' ', title) + title = re.sub(r'[\.\-\/\_]', r' ', title) # if artistterm != 'Various Artists': # @@ -1235,23 +1253,23 @@ def verifyresult(title, artistterm, term, lossless): title, each_word) return False - tokens = re.split('\W', term, re.IGNORECASE | re.UNICODE) + tokens = re.split(r'\W', term, re.IGNORECASE | re.UNICODE) + for token in tokens: if not token: continue if token == 'Various' or token == 'Artists' or token == 'VA': continue - if not re.search('(?:\W|^)+' + token + '(?:\W|$)+', title, re.IGNORECASE | re.UNICODE): + if not has_token(title, token): cleantoken = ''.join(c for c in token if c not in string.punctuation) - if not not re.search('(?:\W|^)+' + cleantoken + '(?:\W|$)+', title, - re.IGNORECASE | re.UNICODE): + if not has_token(title, cleantoken): dic = {'!': 'i', '$': 's'} dumbtoken = helpers.replace_all(token, dic) - if not not re.search('(?:\W|^)+' + dumbtoken + '(?:\W|$)+', title, - re.IGNORECASE | re.UNICODE): - logger.info("Removed from results: %s (missing tokens: %s and %s)", title, - token, cleantoken) + if not has_token(title, dumbtoken): + logger.info( + "Removed from results: %s (missing tokens: [%s, %s, %s])", + title, token, cleantoken, dumbtoken) return False return True @@ -1309,12 +1327,12 @@ def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, else: usersearchterm = '' - semi_clean_artist_term = re.sub('[\.\-\/]', ' ', semi_cleanartist) - semi_clean_album_term = re.sub('[\.\-\/]', ' ', semi_cleanalbum) + semi_clean_artist_term = re.sub(r'[\.\-\/]', r' ', semi_cleanartist) + semi_clean_album_term = re.sub(r'[\.\-\/]', r' ', semi_cleanalbum) # Replace bad characters in the term - term = re.sub('[\.\-\/]', ' ', term) - artistterm = re.sub('[\.\-\/]', ' ', cleanartist) - albumterm = re.sub('[\.\-\/]', ' ', cleanalbum) + term = re.sub(r'[\.\-\/]', r' ', term) + artistterm = re.sub(r'[\.\-\/]', r' ', cleanartist) + albumterm = re.sub(r'[\.\-\/]', r' ', cleanalbum) # If Preferred Bitrate and High Limit and Allow Lossless then get both lossy and lossless if headphones.CONFIG.PREFERRED_QUALITY == 2 and headphones.CONFIG.PREFERRED_BITRATE and headphones.CONFIG.PREFERRED_BITRATE_HIGH_BUFFER and headphones.CONFIG.PREFERRED_BITRATE_ALLOW_LOSSLESS: @@ -1927,15 +1945,14 @@ def preprocess(resultlist): if result[4] == 'bandcamp': return True, result - if result[4] == 'torrent': - - if result.provider in ["The Pirate Bay", "Old Pirate Bay"]: + if result[4] == 'torrent' and result.provider in ["The Pirate Bay", "Old Pirate Bay"]: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) \ AppleWebKit/537.36 (KHTML, like Gecko) \ Chrome/41.0.2243.2 Safari/537.36' } + else: headers = {'User-Agent': USER_AGENT}