Merge remote-tracking branch 'basilfx/postprocess_improvements' into develop

This commit is contained in:
rembo10
2014-03-31 16:58:20 -07:00
2 changed files with 143 additions and 35 deletions

View File

@@ -18,8 +18,16 @@ from operator import itemgetter
import datetime
import re, shutil
from lib.beets.mediafile import MediaFile, FileTypeError, UnreadableFileError
import headphones
# Modified from https://github.com/Verrus/beets-plugin-featInTitle
RE_FEATURING = re.compile(r"[fF]t\.|[fF]eaturing|[fF]eat\.|\b[wW]ith\b|&|vs\.")
RE_CD_ALBUM = re.compile(r"\(?((CD|disc)\s*[0-9]+)\)", re.I)
RE_CD = re.compile(r"^(CD|dics)\s*[0-9]+$", re.I)
def multikeysort(items, columns):
comparers = [ ((itemgetter(col[1:].strip()), -1) if col.startswith('-') else (itemgetter(col.strip()), 1)) for col in columns]
@@ -209,7 +217,100 @@ def extract_data(s):
return (name, album, year)
else:
return (None, None, None)
def extract_metadata(f):
"""
Scan all files in the given directory and decide on an artist, album and
year based on the metadata. A decision is based on the number of different
artists, albums and years found in the media files.
"""
from headphones import logger
# Walk directory and scan all media files
results = []
count = 0
for root, dirs, files in os.walk(f):
for file in files:
# Count the number of potential media files
extension = os.path.splitext(file)[1].lower()[1:]
if extension in headphones.MEDIA_FORMATS:
count += 1
# Try to read the file info
try:
media_file = MediaFile(os.path.join(root, file))
except FileTypeError, UnreadableFileError:
# Probably not a media file
continue
# Append metadata to file
artist = media_file.albumartist or media_file.artist
album = media_file.album
year = media_file.year
if artist and album and year:
results.append((artist.lower(), album.lower(), year))
# Verify results
if len(results) == 0:
logger.info("No metadata in media files found, ignoring")
return (None, None, None)
# Require that some percentage of files have tags
count_ratio = 0.75
if count < (count_ratio * len(results)):
logger.info("Counted %d media files, but only %d have tags, ignoring" % (count, len(results)))
return (None, None, None)
# Count distinct values
artists = list(set([ x[0] for x in results ]))
albums = list(set([ x[1] for x in results ]))
years = list(set([ x[2] for x in results ]))
# Remove things such as CD2 from album names
if len(albums) > 1:
new_albums = list(albums)
# Replace occurences of e.g. CD1
for index, album in enumerate(new_albums):
if RE_CD_ALBUM.search(album):
new_albums[index] = RE_CD_ALBUM.sub("", album).strip()
# Remove duplicates
new_albums = list(set(new_albums))
# Safety check: if nothing has merged, then ignore the work. This can
# happen if only one CD of a multi part CD is processed.
if len(new_albums) < len(albums):
albums = new_albums
# All files have the same metadata, so it's trivial
if len(artists) == 1 and len(albums) == 1 and len(years) == 1:
return (artists[0], albums[0], years[0])
# (Lots of) different artists. Could be a featuring album, so test for this.
if len(artists) > 1 and len(albums) == 1 and len(years) == 1:
split_artists = [ RE_FEATURING.split(artist) for artist in artists ]
featurings = [ len(split_artist) - 1 for split_artist in split_artists ]
logger.info("Album seem to feature %d different artists" % sum(featurings))
if sum(featurings) > 0:
# Find the artist of which the least splits have been generated.
# Ideally, this should be 0, which should be the album artist
# itself.
artist = split_artists[featurings.index(min(featurings))][0]
# Done
return (artist, albums[0], years[0])
# Not sure what to do here.
logger.info("Found %d artists, %d albums and %d years in metadata, ignoring" % (len(artists), len(albums), len(years)))
return (None, None, None)
def extract_logline(s):
# Default log format
pattern = re.compile(r'(?P<timestamp>.*?)\s\-\s(?P<level>.*?)\s*\:\:\s(?P<thread>.*?)\s\:\s(?P<message>.*)', re.VERBOSE)

View File

@@ -954,15 +954,17 @@ def forcePostProcess():
myDB = db.DBConnection()
for folder in folders:
folder_basename = os.path.basename(folder).decode(headphones.SYS_ENCODING, 'replace')
logger.info('Processing: %s' % folder_basename)
# First try to see if there's a match in the snatched table, then we'll try to parse the foldername
# TODO: Iterate through underscores -> spaces, spaces -> dots, underscores -> dots (this might be hit or miss since it assumes
# all spaces/underscores came from sab replacing values
# Attempt 1: First try to see if there's a match in the snatched table,
# then we'll try to parse the foldername.
# TODO: Iterate through underscores -> spaces, spaces -> dots,
# underscores -> dots (this might be hit or miss since it assumes all
# spaces/underscores came from sab replacing values
logger.debug('Attempting to find album in the snatched table')
snatched = myDB.action('SELECT AlbumID, Title, Kind, Status from snatched WHERE FolderName LIKE ?', [folder_basename]).fetchone()
if snatched:
if headphones.KEEP_TORRENT_FILES and snatched['Kind'] == 'torrent' and snatched['Status'] == 'Processed':
logger.info(folder_basename + ' is a torrent folder being preserved for seeding and has already been processed. Skipping.')
@@ -971,16 +973,23 @@ def forcePostProcess():
logger.info('Found a match in the database: %s. Verifying to make sure it is the correct album' % snatched['Title'])
verify(snatched['AlbumID'], folder, snatched['Kind'])
continue
# Try to parse the folder name into a valid format
# TODO: Add metadata lookup
# Attempt 2a: parse the folder name into a valid format
try:
logger.debug('Attempting to extract name, album and year from folder name')
name, album, year = helpers.extract_data(folder_basename)
except:
except Exception as e:
name = None
# Attempt 2b: deduce meta data into a valid format
if name is None:
try:
logger.debug('Attempting to extract name, album and year from metadata')
name, album, year = helpers.extract_metadata(folder)
except Exception as e:
name = None
if name and album and year:
release = myDB.action('SELECT AlbumID, ArtistName, AlbumTitle from albums WHERE ArtistName LIKE ? and AlbumTitle LIKE ?', [name, album]).fetchone()
if release:
logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle']))
@@ -992,30 +1001,28 @@ def forcePostProcess():
rgid = mb.findAlbumID(helpers.latinToAscii(name), helpers.latinToAscii(album))
except:
logger.error('Can not get release information for this album')
continue
if rgid:
verify(rgid, folder)
continue
else:
logger.info('No match found on MusicBrainz for: %s - %s' % (name, album))
continue
else:
try:
possible_rgid = folder_basename[-36:]
# re pattern match: [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}
rgid = uuid.UUID(possible_rgid)
except:
logger.info("Couldn't parse " + folder_basename + " into any valid format. If adding albums from another source, they must be in an 'Artist - Album [Year]' format, or end with the musicbrainz release group id")
continue
if rgid:
rgid = possible_rgid
release = myDB.action('SELECT ArtistName, AlbumTitle, AlbumID from albums WHERE AlbumID=?', [rgid]).fetchone()
if release:
logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle']))
verify(release['AlbumID'], folder, forced=True)
else:
logger.info('Found a (possibly) valid Musicbrainz identifier in album folder name - continuing post-processing')
verify(rgid, folder, forced=True)
# Attempt 3: strip release group id from filename
try:
logger.debug('Attempting to extract release group from folder name')
possible_rgid = folder_basename[-36:]
# re pattern match: [0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}
rgid = uuid.UUID(possible_rgid)
except:
logger.info("Couldn't parse " + folder_basename + " into any valid format. If adding albums from another source, they must be in an 'Artist - Album [Year]' format, or end with the musicbrainz release group id")
rgid = None
if rgid:
rgid = possible_rgid
release = myDB.action('SELECT ArtistName, AlbumTitle, AlbumID from albums WHERE AlbumID=?', [rgid]).fetchone()
if release:
logger.info('Found a match in the database: %s - %s. Verifying to make sure it is the correct album' % (release['ArtistName'], release['AlbumTitle']))
verify(release['AlbumID'], folder, forced=True)
else:
logger.info('Found a (possibly) valid Musicbrainz identifier in album folder name - continuing post-processing')
verify(rgid, folder, forced=True)