helpers: Replace cleanName() implementation to get much higher match ratio.

Track matching is performed using 'CleanName', which up to now was obtained in a convoluted way that effectively removed any non-ASCII alphanumeric characters but at the same time left some trash behind, preventing names from being matched due to whitespace differences. The current implementation performs most of the transliteration using Unicode NFD decomposition to remove diacritical marks from characters in Latin scripts, leaving the others intact. Only alphanumeric characters are included in the resulting string, and all spaces are coalesced. Based on observations on a library of several tens of GiB, this allows for a much better ratio of automatic track matches.
2026-04-12 07:59:26 +01:00 · 2016-02-27 23:31:16 +01:00
parent 7e9bd432ce
commit fd8fb4529c
6 changed files with 169 additions and 26 deletions
--- a/headphones/helpers.py
+++ b/headphones/helpers.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #  This file is part of Headphones.
 #
 #  Headphones is free software: you can redistribute it and/or modify
@@ -219,12 +220,104 @@ def replace_illegal_chars(string, type="file"):
    return string


-def cleanName(string):
-    pass1 = latinToAscii(string).lower()
-    out_string = re.sub('[\.\-\/\!\@\#\$\%\^\&\*\(\)\+\-\"\'\,\;\:\[\]\{\}\<\>\=\_]', '',
-                        pass1).encode('utf-8')
+_CN_RE1 = re.compile(ur'[^\w]+', re.UNICODE)  # runs of non-word characters
+_CN_RE2 = re.compile(ur'[\s_]+', re.UNICODE)  # runs of whitespace/underscores

-    return out_string
+
+_XLATE_GRAPHICAL_AND_DIACRITICAL = {
+    # Translation table.
+    # Covers the following letters, which NFD leaves untouched because they
+    # have no combining-character decomposition:
+    # ©ª«®²³¹»¼½¾ÆÐØÞßæðøþĐđĦħıĲĳĸĿŀŁłŒœŦŧǄǅǆǇǈǉǊǋǌǤǥǱǲǳȤȥ. This
+    # also includes some graphical symbols which can be easily replaced and
+    # are usually typed by people who lack an appropriate keyboard layout.
+    u'©': '(C)', u'ª': 'a.', u'«': '<<', u'®': '(R)', u'²': '2', u'³': '3',
+    u'¹': '1', u'»': '>>', u'¼': ' 1/4 ', u'½': ' 1/2 ', u'¾': ' 3/4 ',
+    u'Æ': 'AE', u'Ð': 'D', u'Ø': 'O', u'Þ': 'Th', u'ß': 'ss', u'æ': 'ae',
+    u'ð': 'd', u'ø': 'o', u'þ': 'th', u'Đ': 'D', u'đ': 'd', u'Ħ': 'H',
+    u'ħ': 'h', u'ı': 'i', u'Ĳ': 'IJ', u'ĳ': 'ij', u'ĸ': 'q', u'Ŀ': 'L',
+    u'ŀ': 'l', u'Ł': 'L', u'ł': 'l', u'Œ': 'OE', u'œ': 'oe', u'Ŧ': 'T',
+    u'ŧ': 't', u'Ǆ': 'DZ', u'ǅ': 'Dz', u'ǆ': 'dz', u'Ǉ': 'LJ', u'ǈ': 'Lj',
+    u'ǉ': 'lj', u'Ǌ': 'NJ', u'ǋ': 'Nj', u'ǌ': 'nj',
+    u'Ǥ': 'G', u'ǥ': 'g', u'Ǳ': 'DZ', u'ǲ': 'Dz', u'ǳ': 'dz',
+    u'Ȥ': 'Z', u'ȥ': 'z', u'№': 'No.',
+    u'º': 'o.',        # normalize Nº abbrev (popular w/ classical music),
+                       # this is 'masculine ordinal indicator', not degree
+}
+
+_XLATE_SPECIAL = {
+    # Translation table.
+    # Covers additional special characters handled during normalization.
+    u"'": '',         # drop apostrophes entirely
+    u'&': ' and ',     # expand & to ' and '
+}
+
+
+def _translate(s, dictionary):
+    # type: (basestring, Mapping[basestring, basestring]) -> basestring
+    return ''.join([dictionary.get(ch, ch) for ch in s])  # unmapped: kept
+
+
+# Unicode blocks holding the combining diacritical marks stripped during
+# transliteration; each entry is an inclusive (first, last) code point pair.
+_COMBINING_RANGES = (
+    (0x0300, 0x036f),   # Combining Diacritical Marks
+    (0x1ab0, 0x1aff),   # Combining Diacritical Marks Extended
+    (0x20d0, 0x20ff),   # Combining Diacritical Marks for Symbols
+    (0x1dc0, 0x1dff)    # Combining Diacritical Marks Supplement
+)
+
+
+def _is_unicode_combining(u):
+    # type: (unicode)->bool
+    """Tell whether the single character `u` is a combining diacritical mark.
+
+    The character qualifies when its code point lies in _COMBINING_RANGES.
+    """
+    code_point = ord(u)
+    return any(lo <= code_point <= hi for lo, hi in _COMBINING_RANGES)
+
+
+def _transliterate(u, xlate):
+    # type: (unicode, Mapping[basestring, basestring]) -> unicode
+    """
+    Strip combining diacritical marks from `u` (after NFD decomposition),
+    then substitute the characters found in the `xlate` dictionary.
+    """
+    decomposed = unicodedata.normalize('NFD', u)
+    bare = u''.join(c for c in decomposed if not _is_unicode_combining(c))
+    # _translate() may hand back unicode or plain ascii; always return unicode
+    return unicode(_translate(bare, xlate))
+
+
+def clean_name(s):
+    # type: (basestring)->unicode
+    """Remove non-alphanumeric characters from the string, normalize and
+    substitute some special characters, coalesce spaces and lowercase.
+    :param s: string to clean up, possibly unicode one.
+    :return: cleaned-up, lowercased version of input string.
+    """
+    if not isinstance(s, unicode):
+        # a narrow (byte) string can only be trusted to hold ASCII, so any
+        # extended chars are replaced; pass unicode for meaningful texts
+        u = unicode(s, 'ascii', 'replace')
+    else:
+        u = s
+    # 1. don't bother doing normalization NFKC yet, rather transliterate
+    # using special translation table
+    u = _transliterate(u, _XLATE_GRAPHICAL_AND_DIACRITICAL)
+    # 2. normalize NFKC the result
+    u = unicodedata.normalize('NFKC', u)
+    # 3. translate specials
+    u = _translate(u, _XLATE_SPECIAL)
+    # 4. replace any non-alphanumeric character sequences by spaces
+    u = _CN_RE1.sub(u' ', u)
+    # 5. coalesce interleaved space/underscore sequences
+    u = _CN_RE2.sub(u' ', u)
+    # 6. trim
+    u = u.strip()
+    # 7. lowercase, so every caller compares and stores the same form
+    return u.lower()


 def cleanTitle(title):
--- a/headphones/helpers_test.py
+++ b/headphones/helpers_test.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from unittestcompat import TestCase
+from headphones.helpers import clean_name
+
+
+class HelpersTest(TestCase):
+
+    def test_clean_name(self):
+        """helpers: check correctness of clean_name() function"""
+        # maps raw input to a reference text; both sides must clean up to the
+        # same string
+        cases = {
+            u' Weiße & rose ': 'Weisse and rose',
+            u'Multiple / spaces': 'Multiple spaces',
+            u'Kevin\'s m²': 'Kevins m2',
+            u'Symphonęy Nº9': 'Symphoney No.9',
+            u'ÆæßðÞĲĳ': u'AeaessdThIJıj',
+            u'Obsessió (Cerebral Apoplexy remix)': 'obsessio cerebral '
+                                                    'apoplexy remix',
+            u'Doktór Hałabała i siedmiu zbojów': 'doktor halabala i siedmiu '
+                                                 'zbojow',
+            u'Arbetets Söner och Döttrar': 'arbetets soner och dottrar',
+            u'Björk Guðmundsdóttir': 'bjork gudmundsdottir',
+            u'L\'Arc~en~Ciel': 'larc en ciel',
+            u'Orquesta de la Luz (オルケスタ・デ・ラ・ルス)':
+                u'Orquesta de la Luz オルケスタ デ ラ ルス'
+        }
+        for raw, reference in cases.iteritems():
+            cleaned = clean_name(raw).lower()
+            wanted = clean_name(reference).lower()
+            message = u"check cleaning of case (%s,%s)" % (cleaned, wanted)
+            self.assertEqual(cleaned, wanted, message)
+
+    def test_clean_name_nonunicode(self):
+        """helpers: check if clean_name() works on non-unicode input"""
+        # plain ASCII narrow string
+        ascii_text = 'foo $ bar/BAZ'
+        self.assertEqual(
+            clean_name(ascii_text).lower(), 'foo bar baz',
+            "check clean_name() works on non-unicode"
+        )
+        # narrow string that carries non-ASCII characters
+        narrow_text = 'fóó $ BAZ'
+        self.assertEqual(
+            clean_name(narrow_text).lower(),
+            clean_name('%fóó baz ').lower(),
+            "check clean_name() with narrow non-ascii input"
+        )
--- a/headphones/importer.py
+++ b/headphones/importer.py
@@ -374,7 +374,7 @@ def addArtisttoDB(artistid, extrasonly=False, forcefull=False, type="artist"):

            for track in hybridrelease['Tracks']:

-                cleanname = helpers.cleanName(
+                cleanname = helpers.clean_name(
                    artist['artist_name'] + ' ' + rg['title'] + ' ' + track['title'])

                controlValueDict = {"TrackID": track['id'],
@@ -710,7 +710,7 @@ def addReleaseById(rid, rgid=None):
        myDB.action('INSERT INTO releases VALUES( ?, ?)', [rid, release_dict['rgid']])

        for track in release_dict['tracks']:
-            cleanname = helpers.cleanName(
+            cleanname = helpers.clean_name(
                release_dict['artist_name'] + ' ' + release_dict['rg_title'] + ' ' + track['title'])

            controlValueDict = {"TrackID": track['id'],
--- a/headphones/librarysync.py
+++ b/headphones/librarysync.py
@@ -138,7 +138,7 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
                # TODO: skip adding songs without the minimum requisite information (just a matter of putting together the right if statements)

                if f_artist and f.album and f.title:
-                    CleanName = helpers.cleanName(f_artist + ' ' + f.album + ' ' + f.title)
+                    CleanName = helpers.clean_name(f_artist + ' ' + f.album + ' ' + f.title)
                else:
                    CleanName = None

@@ -332,15 +332,15 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
        # There was a bug where artists with special characters (-,') would show up in new artists.
        artist_list = [
            x for x in unique_artists
-            if helpers.cleanName(x).lower() not in [
-                helpers.cleanName(y[0]).lower()
+            if helpers.clean_name(x).lower() not in [
+                helpers.clean_name(y[0]).lower()
                for y in current_artists
                ]
            ]
        artists_checked = [
            x for x in unique_artists
-            if helpers.cleanName(x).lower() in [
-                helpers.cleanName(y[0]).lower()
+            if helpers.clean_name(x).lower() in [
+                helpers.clean_name(y[0]).lower()
                for y in current_artists
                ]
            ]
--- a/headphones/mb.py
+++ b/headphones/mb.py
@@ -637,7 +637,7 @@ def get_new_releases(rgid, includeExtras=False, forcefull=False):

            for track in release['Tracks']:

-                cleanname = helpers.cleanName(
+                cleanname = helpers.clean_name(
                    release['ArtistName'] + ' ' + release['AlbumTitle'] + ' ' + track['title'])

                controlValueDict = {"TrackID": track['id'],
--- a/headphones/webserve.py
+++ b/headphones/webserve.py
@@ -29,7 +29,7 @@ import urllib2
 import os
 import re
 from headphones import logger, searcher, db, importer, mb, lastfm, librarysync, helpers, notifiers
-from headphones.helpers import checked, radio, today, cleanName
+from headphones.helpers import checked, radio, today, clean_name
 from mako.lookup import TemplateLookup
 from mako import exceptions
 import headphones
@@ -577,7 +577,7 @@ class WebInterface(object):
        for albums in have_albums:
            # Have to skip over manually matched tracks
            if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
                # else:
                #     original_clean = None
@@ -595,10 +595,12 @@ class WebInterface(object):
        # unmatchedalbums = [f for f in have_album_dictionary if f not in [x for x in headphones_album_dictionary]]

        check = set(
-            [(cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) for d in
+            [(clean_name(d['ArtistName']).lower(),
+              clean_name(d['AlbumTitle']).lower()) for d in
             headphones_album_dictionary])
        unmatchedalbums = [d for d in have_album_dictionary if (
-        cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) not in check]
+            clean_name(d['ArtistName']).lower(),
+            clean_name(d['AlbumTitle']).lower()) not in check]

        return serve_template(templatename="manageunmatched.html", title="Manage Unmatched Items",
                              unmatchedalbums=unmatchedalbums)
@@ -622,8 +624,8 @@ class WebInterface(object):
                (artist, album))

        elif action == "matchArtist":
-            existing_artist_clean = helpers.cleanName(existing_artist).lower()
-            new_artist_clean = helpers.cleanName(new_artist).lower()
+            existing_artist_clean = helpers.clean_name(existing_artist).lower()
+            new_artist_clean = helpers.clean_name(new_artist).lower()
            if new_artist_clean != existing_artist_clean:
                have_tracks = myDB.action(
                    'SELECT Matched, CleanName, Location, BitRate, Format FROM have WHERE ArtistName=?',
@@ -668,10 +670,10 @@ class WebInterface(object):
                    "Artist %s already named appropriately; nothing to modify" % existing_artist)

        elif action == "matchAlbum":
-            existing_artist_clean = helpers.cleanName(existing_artist).lower()
-            new_artist_clean = helpers.cleanName(new_artist).lower()
-            existing_album_clean = helpers.cleanName(existing_album).lower()
-            new_album_clean = helpers.cleanName(new_album).lower()
+            existing_artist_clean = helpers.clean_name(existing_artist).lower()
+            new_artist_clean = helpers.clean_name(new_artist).lower()
+            existing_album_clean = helpers.clean_name(existing_album).lower()
+            new_album_clean = helpers.clean_name(new_album).lower()
            existing_clean_string = existing_artist_clean + " " + existing_album_clean
            new_clean_string = new_artist_clean + " " + new_album_clean
            if existing_clean_string != new_clean_string:
@@ -728,7 +730,7 @@ class WebInterface(object):
            'SELECT ArtistName, AlbumTitle, TrackTitle, CleanName, Matched from have')
        for albums in manualalbums:
            if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
                if albums['Matched'] == "Ignored" or albums['Matched'] == "Manual" or albums[
                    'CleanName'] != original_clean:
@@ -769,7 +771,7 @@ class WebInterface(object):
                [artist])
            update_count = 0
            for tracks in update_clean:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
                        'TrackTitle']).lower()
                album = tracks['AlbumTitle']
@@ -797,7 +799,7 @@ class WebInterface(object):
                (artist, album))
            update_count = 0
            for tracks in update_clean:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
                        'TrackTitle']).lower()
                track_title = tracks['TrackTitle']