Merge remote-tracking branch 'andrzejc/clean-name' into develop

2026-04-18 19:09:28 +01:00 · 2016-04-05 12:04:44 +01:00
parent db3df7e6c9 fd8fb4529c
commit b894ebb3c5
6 changed files with 169 additions and 26 deletions
--- a/headphones/helpers.py
+++ b/headphones/helpers.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #  This file is part of Headphones.
 #
 #  Headphones is free software: you can redistribute it and/or modify
@@ -219,12 +220,104 @@ def replace_illegal_chars(string, type="file"):
    return string


-def cleanName(string):
-    pass1 = latinToAscii(string).lower()
-    out_string = re.sub('[\.\-\/\!\@\#\$\%\^\&\*\(\)\+\-\"\'\,\;\:\[\]\{\}\<\>\=\_]', '',
-                        pass1).encode('utf-8')
+_CN_RE1 = re.compile(ur'[^\w]+', re.UNICODE)  # runs of non-word characters
+_CN_RE2 = re.compile(ur'[\s_]+', re.UNICODE)  # runs of whitespace/underscores

-    return out_string
+
+_XLATE_GRAPHICAL_AND_DIACRITICAL = {
+    # Translation table.
+    # Covers the following characters, which NFD does not split into a base
+    # letter plus a combining mark:
+    # ©ª«®²³¹»¼½¾ÆÐØÞßæðøþĐđĦħıĲĳĸĿŀŁłŒœŦŧǄǅǆǇǈǉǊǋǌǤǥǱǲǳȤȥ. It also
+    # includes some graphical symbols with easy ASCII replacements, which
+    # people lacking an appropriate keyboard layout usually type instead.
+    u'©': '(C)', u'ª': 'a.', u'«': '<<', u'®': '(R)', u'²': '2', u'³': '3',
+    u'¹': '1', u'»': '>>', u'¼': ' 1/4 ', u'½': ' 1/2 ', u'¾': ' 3/4 ',
+    u'Æ': 'AE', u'Ð': 'D', u'Ø': 'O', u'Þ': 'Th', u'ß': 'ss', u'æ': 'ae',
+    u'ð': 'd', u'ø': 'o', u'þ': 'th', u'Đ': 'D', u'đ': 'd', u'Ħ': 'H',
+    u'ħ': 'h', u'ı': 'i', u'Ĳ': 'IJ', u'ĳ': 'ij', u'ĸ': 'q', u'Ŀ': 'L',
+    u'ŀ': 'l', u'Ł': 'L', u'ł': 'l', u'Œ': 'OE', u'œ': 'oe', u'Ŧ': 'T',
+    u'ŧ': 't', u'Ǆ': 'DZ', u'ǅ': 'Dz', u'ǆ': 'dz', u'Ǉ': 'LJ', u'ǈ': 'Lj',
+    u'ǉ': 'lj', u'Ǌ': 'NJ', u'ǋ': 'Nj', u'ǌ': 'nj',
+    u'Ǥ': 'G', u'ǥ': 'g', u'Ǳ': 'DZ', u'ǲ': 'Dz', u'ǳ': 'dz',
+    u'Ȥ': 'Z', u'ȥ': 'z', u'№': 'No.',
+    u'º': 'o.',        # normalize Nº abbrev (popular w/ classical music),
+                       # this is 'masculine ordinal indicator', not degree
+}
+
+_XLATE_SPECIAL = {
+    # Translation table.
+    # Extra single-character substitutions used by clean_name() (step 3).
+    u"'": '',         # replace apostrophe with nothing
+    u'&': ' and ',     # expand & to ' and '
+}
+
+
+def _translate(s, dictionary):
+    # type: (basestring,Mapping[basestring,basestring])->basestring
+    return ''.join(dictionary.get(x, x) for x in s)  # unmapped chars kept
+
+
+_COMBINING_RANGES = (  # inclusive (first, last) code points per Unicode block
+    (0x0300, 0x036f),   # Combining Diacritical Marks
+    (0x1ab0, 0x1aff),   # Combining Diacritical Marks Extended
+    (0x20d0, 0x20ff),   # Combining Diacritical Marks for Symbols
+    (0x1dc0, 0x1dff)    # Combining Diacritical Marks Supplement
+)
+
+
+def _is_unicode_combining(u):
+    # type: (unicode)->bool
+    """
+    Tell whether the single character ``u`` has a code point inside one of
+    the inclusive ranges listed in ``_COMBINING_RANGES``.
+    """
+    code_point = ord(u)
+    return any(
+        low <= code_point <= high for low, high in _COMBINING_RANGES
+    )
+
+
+def _transliterate(u, xlate):
+    # type: (unicode,Mapping[basestring,basestring])->unicode
+    """
+    NFD-decompose `u`, drop combining marks, then map chars through `xlate`.
+    """
+    decomposed = unicodedata.normalize('NFD', u)
+    stripped = u''.join(x for x in decomposed if not _is_unicode_combining(x))
+    translated = _translate(stripped, xlate)
+    # at this point output is either unicode, or plain ascii
+    return unicode(translated)
+
+
+def clean_name(s):
+    # type: (basestring)->unicode
+    """Normalize a name for matching: strip diacritics, transliterate special
+    characters, turn non-alphanumeric runs into single spaces, trim, lowercase.
+    :param s: string to clean up, preferably a unicode one.
+    :return: cleaned-up, lowercased unicode version of the input string.
+    """
+    if not isinstance(s, unicode):
+        # narrow (byte) strings are decoded as ascii; any non-ascii byte is
+        # replaced, so pass unicode whenever the text is meaningful
+        u = unicode(s, 'ascii', 'replace')
+    else:
+        u = s
+    # 1. don't bother doing normalization NFKC first, rather transliterate
+    # using the special translation table
+    u = _transliterate(u, _XLATE_GRAPHICAL_AND_DIACRITICAL)
+    # 2. normalize NFKC the result
+    u = unicodedata.normalize('NFKC', u)
+    # 3. translate specials
+    u = _translate(u, _XLATE_SPECIAL)
+    # 4. replace any non-alphanumeric character sequences by spaces
+    u = _CN_RE1.sub(u' ', u)
+    # 5. coalesce interleaved space/underscore sequences
+    u = _CN_RE2.sub(u' ', u)
+    # 6. trim
+    u = u.strip()
+    # 7. lowercase, as the replaced cleanName() did
+    return u.lower()


 def cleanTitle(title):
--- a/headphones/helpers_test.py
+++ b/headphones/helpers_test.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from unittestcompat import TestCase
+from headphones.helpers import clean_name
+
+
+class HelpersTest(TestCase):
+
+    def test_clean_name(self):
+        """helpers: check correctness of clean_name() function"""
+        cases = {
+            u' Weiße & rose ': 'Weisse and rose',
+            u'Multiple / spaces': 'Multiple spaces',
+            u'Kevin\'s m²': 'Kevins m2',
+            u'Symphonęy Nº9': 'Symphoney No.9',
+            u'ÆæßðÞĲĳ': u'AeaessdThIJıj',
+            u'Obsessió (Cerebral Apoplexy remix)': 'obsessio cerebral '
+                                                    'apoplexy remix',
+            u'Doktór Hałabała i siedmiu zbojów': 'doktor halabala i siedmiu '
+                                                 'zbojow',
+            u'Arbetets Söner och Döttrar': 'arbetets soner och dottrar',
+            u'Björk Guðmundsdóttir': 'bjork gudmundsdottir',
+            u'L\'Arc~en~Ciel': 'larc en ciel',
+            u'Orquesta de la Luz (オルケスタ・デ・ラ・ルス)':
+                u'Orquesta de la Luz オルケスタ デ ラ ルス'
+
+        }
+        for first, second in cases.iteritems():  # both sides get cleaned
+            nf = clean_name(first).lower()
+            ns = clean_name(second).lower()
+            self.assertEqual(
+                nf, ns, u"check cleaning of case (%s,"
+                        u"%s)" % (nf, ns)
+            )
+
+    def test_clean_name_nonunicode(self):
+        """helpers: check if clean_name() works on non-unicode input"""
+        raw = 'foo $ bar/BAZ'  # named `raw` to avoid shadowing builtin input()
+        test = clean_name(raw).lower()
+        expected = 'foo bar baz'
+        self.assertEqual(
+            test, expected, "check clean_name() works on non-unicode"
+        )
+        raw = 'fóó $ BAZ'  # narrow (non-unicode) string with non-ascii chars
+        test = clean_name(raw).lower()
+        expected = clean_name('%fóó baz ').lower()
+        self.assertEqual(
+            test, expected, "check clean_name() with narrow non-ascii input"
+        )
--- a/headphones/importer.py
+++ b/headphones/importer.py
@@ -374,7 +374,7 @@ def addArtisttoDB(artistid, extrasonly=False, forcefull=False, type="artist"):

            for track in hybridrelease['Tracks']:

-                cleanname = helpers.cleanName(
+                cleanname = helpers.clean_name(
                    artist['artist_name'] + ' ' + rg['title'] + ' ' + track['title'])

                controlValueDict = {"TrackID": track['id'],
@@ -710,7 +710,7 @@ def addReleaseById(rid, rgid=None):
        myDB.action('INSERT INTO releases VALUES( ?, ?)', [rid, release_dict['rgid']])

        for track in release_dict['tracks']:
-            cleanname = helpers.cleanName(
+            cleanname = helpers.clean_name(
                release_dict['artist_name'] + ' ' + release_dict['rg_title'] + ' ' + track['title'])

            controlValueDict = {"TrackID": track['id'],
--- a/headphones/librarysync.py
+++ b/headphones/librarysync.py
@@ -138,7 +138,7 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
                # TODO: skip adding songs without the minimum requisite information (just a matter of putting together the right if statements)

                if f_artist and f.album and f.title:
-                    CleanName = helpers.cleanName(f_artist + ' ' + f.album + ' ' + f.title)
+                    CleanName = helpers.clean_name(f_artist + ' ' + f.album + ' ' + f.title)
                else:
                    CleanName = None

@@ -332,15 +332,15 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None,
        # There was a bug where artists with special characters (-,') would show up in new artists.
        artist_list = [
            x for x in unique_artists
-            if helpers.cleanName(x).lower() not in [
-                helpers.cleanName(y[0]).lower()
+            if helpers.clean_name(x).lower() not in [
+                helpers.clean_name(y[0]).lower()
                for y in current_artists
                ]
            ]
        artists_checked = [
            x for x in unique_artists
-            if helpers.cleanName(x).lower() in [
-                helpers.cleanName(y[0]).lower()
+            if helpers.clean_name(x).lower() in [
+                helpers.clean_name(y[0]).lower()
                for y in current_artists
                ]
            ]
--- a/headphones/mb.py
+++ b/headphones/mb.py
@@ -637,7 +637,7 @@ def get_new_releases(rgid, includeExtras=False, forcefull=False):

            for track in release['Tracks']:

-                cleanname = helpers.cleanName(
+                cleanname = helpers.clean_name(
                    release['ArtistName'] + ' ' + release['AlbumTitle'] + ' ' + track['title'])

                controlValueDict = {"TrackID": track['id'],
--- a/headphones/webserve.py
+++ b/headphones/webserve.py
@@ -29,7 +29,7 @@ import urllib2
 import os
 import re
 from headphones import logger, searcher, db, importer, mb, lastfm, librarysync, helpers, notifiers
-from headphones.helpers import checked, radio, today, cleanName
+from headphones.helpers import checked, radio, today, clean_name
 from mako.lookup import TemplateLookup
 from mako import exceptions
 import headphones
@@ -577,7 +577,7 @@ class WebInterface(object):
        for albums in have_albums:
            # Have to skip over manually matched tracks
            if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
                # else:
                #     original_clean = None
@@ -595,10 +595,12 @@ class WebInterface(object):
        # unmatchedalbums = [f for f in have_album_dictionary if f not in [x for x in headphones_album_dictionary]]

        check = set(
-            [(cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) for d in
+            [(clean_name(d['ArtistName']).lower(),
+              clean_name(d['AlbumTitle']).lower()) for d in
             headphones_album_dictionary])
        unmatchedalbums = [d for d in have_album_dictionary if (
-        cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) not in check]
+            clean_name(d['ArtistName']).lower(),
+            clean_name(d['AlbumTitle']).lower()) not in check]

        return serve_template(templatename="manageunmatched.html", title="Manage Unmatched Items",
                              unmatchedalbums=unmatchedalbums)
@@ -622,8 +624,8 @@ class WebInterface(object):
                (artist, album))

        elif action == "matchArtist":
-            existing_artist_clean = helpers.cleanName(existing_artist).lower()
-            new_artist_clean = helpers.cleanName(new_artist).lower()
+            existing_artist_clean = helpers.clean_name(existing_artist).lower()
+            new_artist_clean = helpers.clean_name(new_artist).lower()
            if new_artist_clean != existing_artist_clean:
                have_tracks = myDB.action(
                    'SELECT Matched, CleanName, Location, BitRate, Format FROM have WHERE ArtistName=?',
@@ -668,10 +670,10 @@ class WebInterface(object):
                    "Artist %s already named appropriately; nothing to modify" % existing_artist)

        elif action == "matchAlbum":
-            existing_artist_clean = helpers.cleanName(existing_artist).lower()
-            new_artist_clean = helpers.cleanName(new_artist).lower()
-            existing_album_clean = helpers.cleanName(existing_album).lower()
-            new_album_clean = helpers.cleanName(new_album).lower()
+            existing_artist_clean = helpers.clean_name(existing_artist).lower()
+            new_artist_clean = helpers.clean_name(new_artist).lower()
+            existing_album_clean = helpers.clean_name(existing_album).lower()
+            new_album_clean = helpers.clean_name(new_album).lower()
            existing_clean_string = existing_artist_clean + " " + existing_album_clean
            new_clean_string = new_artist_clean + " " + new_album_clean
            if existing_clean_string != new_clean_string:
@@ -728,7 +730,7 @@ class WebInterface(object):
            'SELECT ArtistName, AlbumTitle, TrackTitle, CleanName, Matched from have')
        for albums in manualalbums:
            if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle'])
                if albums['Matched'] == "Ignored" or albums['Matched'] == "Manual" or albums[
                    'CleanName'] != original_clean:
@@ -769,7 +771,7 @@ class WebInterface(object):
                [artist])
            update_count = 0
            for tracks in update_clean:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
                        'TrackTitle']).lower()
                album = tracks['AlbumTitle']
@@ -797,7 +799,7 @@ class WebInterface(object):
                (artist, album))
            update_count = 0
            for tracks in update_clean:
-                original_clean = helpers.cleanName(
+                original_clean = helpers.clean_name(
                    tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[
                        'TrackTitle']).lower()
                track_title = tracks['TrackTitle']