diff --git a/headphones/helpers.py b/headphones/helpers.py index 79ac9b85..5f8665cd 100644 --- a/headphones/helpers.py +++ b/headphones/helpers.py @@ -18,8 +18,16 @@ from operator import itemgetter import datetime import re, shutil +from lib.beets.mediafile import MediaFile, FileTypeError, UnreadableFileError + import headphones +# Modified from https://github.com/Verrus/beets-plugin-featInTitle +RE_FEATURING = re.compile(r"[fF]t\.|[fF]eaturing|[fF]eat\.|\b[wW]ith\b|&|vs\.") + +RE_CD_ALBUM = re.compile(r"\(?((CD|disc)\s*[0-9]+)\)", re.I) +RE_CD = re.compile(r"^(CD|dics)\s*[0-9]+$", re.I) + def multikeysort(items, columns): comparers = [ ((itemgetter(col[1:].strip()), -1) if col.startswith('-') else (itemgetter(col.strip()), 1)) for col in columns] @@ -209,7 +217,100 @@ def extract_data(s): return (name, album, year) else: return (None, None, None) - + +def extract_metadata(f): + """ + Scan all files in the given directory and decide on an artist, album and + year based on the metadata. A decision is based on the number of different + artists, albums and years found in the media files. + """ + + from headphones import logger + + # Walk directory and scan all media files + results = [] + count = 0 + + for root, dirs, files in os.walk(f): + for file in files: + # Count the number of potential media files + extension = os.path.splitext(file)[1].lower()[1:] + + if extension in headphones.MEDIA_FORMATS: + count += 1 + + # Try to read the file info + try: + media_file = MediaFile(os.path.join(root, file)) + except FileTypeError, UnreadableFileError: + # Probably not a media file + continue + + # Append metadata to file + artist = media_file.albumartist or media_file.artist + album = media_file.album + year = media_file.year + + if artist and album and year: + results.append((artist.lower(), album.lower(), year)) + + # Verify results + if len(results) == 0: + logger.info("No metadata in media files found, ignoring") + return (None, None, None) + + # Require that some percentage of files have tags + count_ratio = 0.75 + + if count < (count_ratio * len(results)): + logger.info("Counted %d media files, but only %d have tags, ignoring" % (count, len(results))) + return (None, None, None) + + # Count distinct values + artists = list(set([ x[0] for x in results ])) + albums = list(set([ x[1] for x in results ])) + years = list(set([ x[2] for x in results ])) + + # Remove things such as CD2 from album names + if len(albums) > 1: + new_albums = list(albums) + + # Replace occurences of e.g. CD1 + for index, album in enumerate(new_albums): + if RE_CD_ALBUM.search(album): + new_albums[index] = RE_CD_ALBUM.sub("", album).strip() + + # Remove duplicates + new_albums = list(set(new_albums)) + + # Safety check: if nothing has merged, then ignore the work. This can + # happen if only one CD of a multi part CD is processed. + if len(new_albums) < len(albums): + albums = new_albums + + # All files have the same metadata, so it's trivial + if len(artists) == 1 and len(albums) == 1 and len(years) == 1: + return (artists[0], albums[0], years[0]) + + # (Lots of) different artists. Could be a featuring album, so test for this. + if len(artists) > 1 and len(albums) == 1 and len(years) == 1: + split_artists = [ RE_FEATURING.split(artist) for artist in artists ] + featurings = [ len(split_artist) - 1 for split_artist in split_artists ] + logger.info("Album seem to feature %d different artists" % sum(featurings)) + + if sum(featurings) > 0: + # Find the artist of which the least splits have been generated. + # Ideally, this should be 0, which should be the album artist + # itself. + artist = split_artists[featurings.index(min(featurings))][0] + + # Done + return (artist, albums[0], years[0]) + + # Not sure what to do here. + logger.info("Found %d artists, %d albums and %d years in metadata, ignoring" % (len(artists), len(albums), len(years))) + return (None, None, None) + def extract_logline(s): # Default log format pattern = re.compile(r'(?P.*?)\s\-\s(?P.*?)\s*\:\:\s(?P.*?)\s\:\s(?P.*)', re.VERBOSE) diff --git a/headphones/postprocessor.py b/headphones/postprocessor.py index 1333936f..bb2e9b49 100644 --- a/headphones/postprocessor.py +++ b/headphones/postprocessor.py @@ -976,11 +976,16 @@ def forcePostProcess(): # TODO: Add metadata lookup try: name, album, year = helpers.extract_data(folder_basename) - except: + + # Try to deduce the name, album and year from the tag info in the + # media files. + if name is None: + name, album, year = helpers.extract_metadata(folder) + except Exception as e: + print e name = None if name and album and year: - release = myDB.action('SELECT AlbumID, ArtistName, AlbumTitle from albums WHERE ArtistName LIKE ? and AlbumTitle LIKE ?', [name, album]).fetchone() if release: logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle']))