helpers: Replace cleanName() implementation to get a much higher match ratio.

Track matching is performed using 'CleanName', which up to now was obtained in
a convoluted way that effectively removed any non-ASCII alphanumeric characters
but at the same time left some trash that prevented names from being matched due
to whitespace differences. The current implementation performs most of the
transliteration using Unicode NFD decomposition to remove diacritical marks from
characters in Latin scripts, leaving the others intact. Only alphanumeric
characters are included in the resulting string, and all spaces are coalesced.
Based on observations on a library of several tens of GiB, this allows for a
much better ratio of automatic track matches.
This commit is contained in:
Andrzej Ciarkowski
2016-02-27 23:31:16 +01:00
parent 7e9bd432ce
commit fd8fb4529c
6 changed files with 169 additions and 26 deletions

View File

@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# This file is part of Headphones.
#
# Headphones is free software: you can redistribute it and/or modify
@@ -219,12 +220,104 @@ def replace_illegal_chars(string, type="file"):
return string
# NOTE(review): this span reads like a diff rendered without +/- markers: the
# cleanName() statements and the two _CN_RE* assignments are interleaved and
# the indentation is gone. Presumably cleanName() is the legacy implementation
# that clean_name() replaces -- confirm against the repository.
def cleanName(string):
# Pass the input through latinToAscii() and lowercase the result.
pass1 = latinToAscii(string).lower()
# Delete every character of the listed punctuation set, then encode the
# result as UTF-8 bytes.
out_string = re.sub('[\.\-\/\!\@\#\$\%\^\&\*\(\)\+\-\"\'\,\;\:\[\]\{\}\<\>\=\_]', '',
pass1).encode('utf-8')
# Runs of non-word characters (Unicode-aware); clean_name() replaces each run
# with a single space.
_CN_RE1 = re.compile(ur'[^\w]+', re.UNICODE)
# Runs of whitespace and/or underscores; clean_name() collapses each run into
# a single space.
_CN_RE2 = re.compile(ur'[\s_]+', re.UNICODE)
return out_string
_XLATE_GRAPHICAL_AND_DIACRITICAL = {
# Translation table.
# Covers the following letters, for which NFD fails because of lack of
# combining character:
# ©ª«®²³¹»¼½¾ÆÐØÞßæðøþĐđĦħıIJijĸĿŀŁłŒœŦŧDŽDždžLJLjljNJNjnjǤǥDZDzdzȤȥ. This
# includes also some graphical symbols which can be easily replaced and
# usually are written by people who don't have appropriate keyboard layout.
u'©': '(C)', u'ª': 'a.', u'«': '<<', u'®': '(R)', u'²': '2', u'³': '3',
u'¹': '1', u'»': '>>', u'¼': ' 1/4 ', u'½': ' 1/2 ', u'¾': ' 3/4 ',
u'Æ': 'AE', u'Ð': 'D', u'Ø': 'O', u'Þ': 'Th', u'ß': 'ss', u'æ': 'ae',
u'ð': 'd', u'ø': 'o', u'þ': 'th', u'Đ': 'D', u'đ': 'd', u'Ħ': 'H',
u'ħ': 'h', u'ı': 'i', u'IJ': 'IJ', u'ij': 'ij', u'ĸ': 'q', u'Ŀ': 'L',
u'ŀ': 'l', u'Ł': 'L', u'ł': 'l', u'Œ': 'OE', u'œ': 'oe', u'Ŧ': 'T',
u'ŧ': 't', u'DŽ': 'DZ', u'Dž': 'Dz', u'LJ': 'LJ', u'Lj': 'Lj',
u'lj': 'lj', u'NJ': 'NJ', u'Nj': 'Nj', u'nj': 'nj',
u'Ǥ': 'G', u'ǥ': 'g', u'DZ': 'DZ', u'Dz': 'Dz', u'dz': 'dz',
u'Ȥ': 'Z', u'ȥ': 'z', u'': 'No.',
u'º': 'o.', # normalize Nº abbrev (popular w/ classical music),
# this is 'masculine ordering indicator', not degree
}
_XLATE_SPECIAL = {
# Translation table.
# Cover additional special characters processing normalization.
u"'": '', # replace apostrophe with nothing
u'&': ' and ', # expand & to ' and '
}
def _translate(s, dictionary):
# type: (basestring,Mapping[basestring,basestring])->basestring
return ''.join(dictionary.get(x, x) for x in s)
_COMBINING_RANGES = (
(0x0300, 0x036f), # Combining Diacritical Marks
(0x1ab0, 0x1aff), # Combining Diacritical Marks Extended
(0x20d0, 0x20ff), # Combining Diacritical Marks for Symbols
(0x1dc0, 0x1dff) # Combining Diacritical Marks Supplement
)
def _is_unicode_combining(u):
    # type: (unicode)->bool
    """
    Tell whether a single character is a combining diacritical mark.

    The character's code point is tested against every inclusive range
    listed in ``_COMBINING_RANGES``.

    :param u: a single unicode character.
    :return: True if the code point lies within one of the ranges.
    """
    code_point = ord(u)
    return any(low <= code_point <= high for low, high in _COMBINING_RANGES)
def _transliterate(u, xlate):
    # type: (unicode, Mapping[unicode, basestring])->unicode
    """
    Strip diacritical marks from ``u`` and substitute characters via ``xlate``.

    The input is NFD-normalized so that accented letters split into a base
    character followed by combining marks; the marks are then discarded and
    the remaining characters are run through the ``xlate`` table.

    :param u: unicode string to transliterate.
    :param xlate: mapping of single characters to their replacements.
    :return: transliterated unicode string.
    """
    decomposed = unicodedata.normalize('NFD', u)
    base_chars = [ch for ch in decomposed if not _is_unicode_combining(ch)]
    translated = _translate(u''.join(base_chars), xlate)
    # the translated text may be a plain ascii str; always hand back unicode
    return unicode(translated)
def clean_name(s):
    # type: (basestring)->unicode
    """Remove non-alphanumeric characters from the string, perform
    normalization and substitution of some special characters; coalesce
    spaces and lowercase the result.

    :param s: string to clean up, possibly unicode one.
    :return: cleaned-up, lowercased version of input string.
    """
    if isinstance(s, unicode):
        u = s
    else:
        # ignore extended chars if someone passed a non-ascii narrow string
        # here: undecodable bytes become U+FFFD, which is not a word
        # character and therefore turns into a space in step 4. Use only
        # unicode for meaningful texts.
        u = unicode(s, 'ascii', 'replace')
    # 1. don't bother doing normalization NFKC, rather transliterate
    #    using special translation table
    u = _transliterate(u, _XLATE_GRAPHICAL_AND_DIACRITICAL)
    # 2. normalize NFKC the result
    u = unicodedata.normalize('NFKC', u)
    # 3. translate specials
    u = _translate(u, _XLATE_SPECIAL)
    # 4. replace any non-alphanumeric character sequences by spaces
    u = _CN_RE1.sub(u' ', u)
    # 5. coalesce interleaved space/underscore sequences (underscore is a
    #    word character, so it survives step 4)
    u = _CN_RE2.sub(u' ', u)
    # 6. trim
    u = u.strip()
    # 7. lowercase -- this step was announced but never performed, which
    #    left results case-sensitive for callers that use them directly
    return u.lower()
def cleanTitle(title):

View File

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from unittestcompat import TestCase
from headphones.helpers import clean_name
class HelpersTest(TestCase):
    # Exercises clean_name(): each pair of strings below must clean up to the
    # same text once both sides are lowercased.

    def test_clean_name(self):
        """helpers: check correctness of clean_name() function"""
        cases = {
            u' Weiße & rose ': 'Weisse and rose',
            u'Multiple / spaces': 'Multiple spaces',
            u'Kevin\'s m²': 'Kevins m2',
            u'Symphonęy Nº9': 'Symphoney No.9',
            u'ÆæßðÞIJij': u'AeaessdThIJıj',
            u'Obsessió (Cerebral Apoplexy remix)': 'obsessio cerebral '
                                                   'apoplexy remix',
            u'Doktór Hałabała i siedmiu zbojów': 'doktor halabala i siedmiu '
                                                 'zbojow',
            u'Arbetets Söner och Döttrar': 'arbetets soner och dottrar',
            u'Björk Guðmundsdóttir': 'bjork gudmundsdottir',
            u'L\'Arc~en~Ciel': 'larc en ciel',
            u'Orquesta de la Luz (オルケスタ・デ・ラ・ルス)':
                u'Orquesta de la Luz オルケスタ デ ラ ルス'
        }
        for given, wanted in cases.items():
            # both sides go through clean_name(), so only the equivalence of
            # the two spellings is checked, not a literal output
            cleaned_given = clean_name(given).lower()
            cleaned_wanted = clean_name(wanted).lower()
            message = u"check cleaning of case (%s,%s)" % (
                cleaned_given, cleaned_wanted)
            self.assertEqual(cleaned_given, cleaned_wanted, message)

    def test_clean_name_nonunicode(self):
        """helpers: check if clean_name() works on non-unicode input"""
        # plain ascii narrow string
        raw = 'foo $ bar/BAZ'
        self.assertEqual(
            clean_name(raw).lower(),
            'foo bar baz',
            "check clean_name() works on non-unicode"
        )

        # narrow string carrying non-ascii bytes
        raw = 'fóó $ BAZ'
        cleaned = clean_name(raw).lower()
        reference = clean_name('%fóó baz ').lower()
        self.assertEqual(
            cleaned,
            reference,
            "check clean_name() with narrow non-ascii input"
        )

View File

@@ -374,7 +374,7 @@ def addArtisttoDB(artistid, extrasonly=False, forcefull=False, type="artist"):
for track in hybridrelease['Tracks']:
cleanname = helpers.cleanName(
cleanname = helpers.clean_name(
artist['artist_name'] + ' ' + rg['title'] + ' ' + track['title'])
controlValueDict = {"TrackID": track['id'],
@@ -710,7 +710,7 @@ def addReleaseById(rid, rgid=None):
myDB.action('INSERT INTO releases VALUES( ?, ?)', [rid, release_dict['rgid']])
for track in release_dict['tracks']:
cleanname = helpers.cleanName(
cleanname = helpers.clean_name(
release_dict['artist_name'] + ' ' + release_dict['rg_title'] + ' ' + track['title'])
controlValueDict = {"TrackID": track['id'],

View File

@@ -138,7 +138,7 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
# TODO: skip adding songs without the minimum requisite information (just a matter of putting together the right if statements)
if f_artist and f.album and f.title:
CleanName = helpers.cleanName(f_artist + ' ' + f.album + ' ' + f.title)
CleanName = helpers.clean_name(f_artist + ' ' + f.album + ' ' + f.title)
else:
CleanName = None
@@ -332,15 +332,15 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
# There was a bug where artists with special characters (-,') would show up in new artists.
artist_list = [
x for x in unique_artists
if helpers.cleanName(x).lower() not in [
helpers.cleanName(y[0]).lower()
if helpers.clean_name(x).lower() not in [
helpers.clean_name(y[0]).lower()
for y in current_artists
]
]
artists_checked = [
x for x in unique_artists
if helpers.cleanName(x).lower() in [
helpers.cleanName(y[0]).lower()
if helpers.clean_name(x).lower() in [
helpers.clean_name(y[0]).lower()
for y in current_artists
]
]

View File

@@ -637,7 +637,7 @@ def get_new_releases(rgid, includeExtras=False, forcefull=False):
for track in release['Tracks']:
cleanname = helpers.cleanName(
cleanname = helpers.clean_name(
release['ArtistName'] + ' ' + release['AlbumTitle'] + ' ' + track['title'])
controlValueDict = {"TrackID": track['id'],

View File

@@ -29,7 +29,7 @@ import urllib2
import os
import re
from headphones import logger, searcher, db, importer, mb, lastfm, librarysync, helpers, notifiers
from headphones.helpers import checked, radio, today, cleanName
from headphones.helpers import checked, radio, today, clean_name
from mako.lookup import TemplateLookup
from mako import exceptions
import headphones
@@ -577,7 +577,7 @@ class WebInterface(object):
for albums in have_albums:
# Have to skip over manually matched tracks
if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
original_clean = helpers.cleanName(
original_clean = helpers.clean_name(
albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
# else:
# original_clean = None
@@ -595,10 +595,12 @@ class WebInterface(object):
# unmatchedalbums = [f for f in have_album_dictionary if f not in [x for x in headphones_album_dictionary]]
check = set(
[(cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) for d in
[(clean_name(d['ArtistName']).lower(),
clean_name(d['AlbumTitle']).lower()) for d in
headphones_album_dictionary])
unmatchedalbums = [d for d in have_album_dictionary if (
cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) not in check]
clean_name(d['ArtistName']).lower(),
clean_name(d['AlbumTitle']).lower()) not in check]
return serve_template(templatename="manageunmatched.html", title="Manage Unmatched Items",
unmatchedalbums=unmatchedalbums)
@@ -622,8 +624,8 @@ class WebInterface(object):
(artist, album))
elif action == "matchArtist":
existing_artist_clean = helpers.cleanName(existing_artist).lower()
new_artist_clean = helpers.cleanName(new_artist).lower()
existing_artist_clean = helpers.clean_name(existing_artist).lower()
new_artist_clean = helpers.clean_name(new_artist).lower()
if new_artist_clean != existing_artist_clean:
have_tracks = myDB.action(
'SELECT Matched, CleanName, Location, BitRate, Format FROM have WHERE ArtistName=?',
@@ -668,10 +670,10 @@ class WebInterface(object):
"Artist %s already named appropriately; nothing to modify" % existing_artist)
elif action == "matchAlbum":
existing_artist_clean = helpers.cleanName(existing_artist).lower()
new_artist_clean = helpers.cleanName(new_artist).lower()
existing_album_clean = helpers.cleanName(existing_album).lower()
new_album_clean = helpers.cleanName(new_album).lower()
existing_artist_clean = helpers.clean_name(existing_artist).lower()
new_artist_clean = helpers.clean_name(new_artist).lower()
existing_album_clean = helpers.clean_name(existing_album).lower()
new_album_clean = helpers.clean_name(new_album).lower()
existing_clean_string = existing_artist_clean + " " + existing_album_clean
new_clean_string = new_artist_clean + " " + new_album_clean
if existing_clean_string != new_clean_string:
@@ -728,7 +730,7 @@ class WebInterface(object):
'SELECT ArtistName, AlbumTitle, TrackTitle, CleanName, Matched from have')
for albums in manualalbums:
if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
original_clean = helpers.cleanName(
original_clean = helpers.clean_name(
albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
if albums['Matched'] == "Ignored" or albums['Matched'] == "Manual" or albums[
'CleanName'] != original_clean:
@@ -769,7 +771,7 @@ class WebInterface(object):
[artist])
update_count = 0
for tracks in update_clean:
original_clean = helpers.cleanName(
original_clean = helpers.clean_name(
tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
'TrackTitle']).lower()
album = tracks['AlbumTitle']
@@ -797,7 +799,7 @@ class WebInterface(object):
(artist, album))
update_count = 0
for tracks in update_clean:
original_clean = helpers.cleanName(
original_clean = helpers.clean_name(
tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
'TrackTitle']).lower()
track_title = tracks['TrackTitle']