# Modified from https://github.com/Verrus/beets-plugin-featInTitle
RE_FEATURING = re.compile(r"[fF]t\.|[fF]eaturing|[fF]eat\.|\b[wW]ith\b|&|vs\.")

# Disc marker embedded in an album title, e.g. "Album (CD1)" or "Album disc 2".
# Both parentheses are optional; the word boundary prevents a match on "cd" or
# "disc" when it is merely the tail of a longer word.
RE_CD_ALBUM = re.compile(r"\(?\b((CD|disc)\s*[0-9]+)\)?", re.I)
# A string consisting solely of a disc marker, e.g. "CD1" or "disc 2".
RE_CD = re.compile(r"^(CD|disc)\s*[0-9]+$", re.I)


def extract_metadata(f):
    """
    Scan all files in the given directory and decide on an artist, album and
    year based on the metadata. A decision is based on the number of different
    artists, albums and years found in the media files.

    `f` is the directory to scan; it is walked recursively. The album artist
    tag is preferred over the track artist tag.

    Returns a tuple (artist, album, year). Artist and album are the
    lower-cased tag values. If no unambiguous decision can be made, the tuple
    (None, None, None) is returned.
    """

    # Imported locally so the function does not rely on a module-level import
    # that may or may not be present.
    import os

    from headphones import logger

    # Walk directory and scan all media files
    results = []
    count = 0

    for root, dirs, files in os.walk(f):
        for filename in files:
            # Count the number of potential media files
            extension = os.path.splitext(filename)[1].lower()[1:]

            if extension in headphones.MEDIA_FORMATS:
                count += 1

            # Try to read the file info. The exception types must be given as
            # a tuple, otherwise only the first one is caught.
            try:
                media_file = MediaFile(os.path.join(root, filename))
            except (FileTypeError, UnreadableFileError):
                # Probably not a media file
                continue

            # Collect the metadata of this file
            artist = media_file.albumartist or media_file.artist
            album = media_file.album
            year = media_file.year

            # Only completely tagged files take part in the decision
            if artist and album and year:
                results.append((artist.lower(), album.lower(), year))

    # Verify results
    if not results:
        logger.info("No metadata in media files found, ignoring")
        return (None, None, None)

    # Require that some percentage of files have tags
    count_ratio = 0.75

    if len(results) < (count_ratio * count):
        logger.info("Counted %d media files, but only %d have tags, ignoring" % (count, len(results)))
        return (None, None, None)

    # Count distinct values
    artists = list(set(x[0] for x in results))
    albums = list(set(x[1] for x in results))
    years = list(set(x[2] for x in results))

    # Remove things such as CD2 from album names
    if len(albums) > 1:
        new_albums = set()

        # Replace occurrences of e.g. CD1
        for album in albums:
            if RE_CD_ALBUM.search(album):
                album = RE_CD_ALBUM.sub("", album).strip()

            # Adding to a set removes the duplicates right away
            new_albums.add(album)

        # Safety check: if nothing has merged, then ignore the work. This can
        # happen if only one CD of a multi part CD is processed.
        if len(new_albums) < len(albums):
            albums = list(new_albums)

    # All files have the same metadata, so it's trivial
    if len(artists) == 1 and len(albums) == 1 and len(years) == 1:
        return (artists[0], albums[0], years[0])

    # (Lots of) different artists. Could be a featuring album, so test for this.
    if len(artists) > 1 and len(albums) == 1 and len(years) == 1:
        split_artists = [RE_FEATURING.split(artist) for artist in artists]
        featurings = [len(split_artist) - 1 for split_artist in split_artists]
        logger.info("Album seem to feature %d different artists" % sum(featurings))

        if sum(featurings) > 0:
            # Find the artist of which the least splits have been generated.
            # Ideally, this should be 0, which should be the album artist
            # itself. Splitting leaves the whitespace around the separator in
            # place, so strip it.
            artist = split_artists[featurings.index(min(featurings))][0].strip()

            # Done, unless the name started with a separator and nothing is
            # left of it. In that case fall through and give up.
            if artist:
                return (artist, albums[0], years[0])

    # Not sure what to do here.
    logger.info("Found %d artists, %d albums and %d years in metadata, ignoring" % (len(artists), len(albums), len(years)))
    return (None, None, None)
foldername. + # TODO: Iterate through underscores -> spaces, spaces -> dots, + # underscores -> dots (this might be hit or miss since it assumes all + # spaces/underscores came from sab replacing values + logger.debug('Attempting to find album in the snatched table') snatched = myDB.action('SELECT AlbumID, Title, Kind, Status from snatched WHERE FolderName LIKE ?', [folder_basename]).fetchone() + if snatched: if headphones.KEEP_TORRENT_FILES and snatched['Kind'] == 'torrent' and snatched['Status'] == 'Processed': logger.info(folder_basename + ' is a torrent folder being preserved for seeding and has already been processed. Skipping.') @@ -971,16 +973,23 @@ def forcePostProcess(): logger.info('Found a match in the database: %s. Verifying to make sure it is the correct album' % snatched['Title']) verify(snatched['AlbumID'], folder, snatched['Kind']) continue - - # Try to parse the folder name into a valid format - # TODO: Add metadata lookup + + # Attempt 2a: parse the folder name into a valid format try: + logger.debug('Attempting to extract name, album and year from folder name') name, album, year = helpers.extract_data(folder_basename) - except: + except Exception as e: name = None + # Attempt 2b: deduce meta data into a valid format + if name is None: + try: + logger.debug('Attempting to extract name, album and year from metadata') + name, album, year = helpers.extract_metadata(folder) + except Exception as e: + name = None + if name and album and year: - release = myDB.action('SELECT AlbumID, ArtistName, AlbumTitle from albums WHERE ArtistName LIKE ? and AlbumTitle LIKE ?', [name, album]).fetchone() if release: logger.info('Found a match in the database: %s - %s. 
Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle'])) @@ -992,30 +1001,28 @@ def forcePostProcess(): rgid = mb.findAlbumID(helpers.latinToAscii(name), helpers.latinToAscii(album)) except: logger.error('Can not get release information for this album') - continue if rgid: verify(rgid, folder) + continue else: logger.info('No match found on MusicBrainz for: %s - %s' % (name, album)) - continue - - else: - try: - possible_rgid = folder_basename[-36:] - # re pattern match: [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} - rgid = uuid.UUID(possible_rgid) - - except: - logger.info("Couldn't parse " + folder_basename + " into any valid format. If adding albums from another source, they must be in an 'Artist - Album [Year]' format, or end with the musicbrainz release group id") - continue - - - if rgid: - rgid = possible_rgid - release = myDB.action('SELECT ArtistName, AlbumTitle, AlbumID from albums WHERE AlbumID=?', [rgid]).fetchone() - if release: - logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle'])) - verify(release['AlbumID'], folder, forced=True) - else: - logger.info('Found a (possibly) valid Musicbrainz identifier in album folder name - continuing post-processing') - verify(rgid, folder, forced=True) + + # Attempt 3: strip release group id from filename + try: + logger.debug('Attempting to extract release group from folder name') + possible_rgid = folder_basename[-36:] + # re pattern match: [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} + rgid = uuid.UUID(possible_rgid) + except: + logger.info("Couldn't parse " + folder_basename + " into any valid format. 
If adding albums from another source, they must be in an 'Artist - Album [Year]' format, or end with the musicbrainz release group id") + rgid = None + + if rgid: + rgid = possible_rgid + release = myDB.action('SELECT ArtistName, AlbumTitle, AlbumID from albums WHERE AlbumID=?', [rgid]).fetchone() + if release: + logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle'])) + verify(release['AlbumID'], folder, forced=True) + else: + logger.info('Found a (possibly) valid Musicbrainz identifier in album folder name - continuing post-processing') + verify(rgid, folder, forced=True)