diff --git a/headphones/helpers.py b/headphones/helpers.py index 5e836178..076468fd 100644 --- a/headphones/helpers.py +++ b/headphones/helpers.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # This file is part of Headphones. # # Headphones is free software: you can redistribute it and/or modify @@ -219,12 +220,104 @@ def replace_illegal_chars(string, type="file"): return string -def cleanName(string): - pass1 = latinToAscii(string).lower() - out_string = re.sub('[\.\-\/\!\@\#\$\%\^\&\*\(\)\+\-\"\'\,\;\:\[\]\{\}\<\>\=\_]', '', - pass1).encode('utf-8') +_CN_RE1 = re.compile(ur'[^\w]+', re.UNICODE) +_CN_RE2 = re.compile(ur'[\s_]+', re.UNICODE) - return out_string + +_XLATE_GRAPHICAL_AND_DIACRITICAL = { + # Translation table. + # Covers the following letters, for which NFD fails because of lack of + # combining character: + # ©ª«®²³¹»¼½¾ÆÐØÞßæðøþĐđĦħıIJijĸĿŀŁłŒœŦŧDŽDždžLJLjljNJNjnjǤǥDZDzdzȤȥ. This + # includes also some graphical symbols which can be easily replaced and + # usually are written by people who don't have appropriate keyboard layout. + u'©': '(C)', u'ª': 'a.', u'«': '<<', u'®': '(R)', u'²': '2', u'³': '3', + u'¹': '1', u'»': '>>', u'¼': ' 1/4 ', u'½': ' 1/2 ', u'¾': ' 3/4 ', + u'Æ': 'AE', u'Ð': 'D', u'Ø': 'O', u'Þ': 'Th', u'ß': 'ss', u'æ': 'ae', + u'ð': 'd', u'ø': 'o', u'þ': 'th', u'Đ': 'D', u'đ': 'd', u'Ħ': 'H', + u'ħ': 'h', u'ı': 'i', u'IJ': 'IJ', u'ij': 'ij', u'ĸ': 'q', u'Ŀ': 'L', + u'ŀ': 'l', u'Ł': 'L', u'ł': 'l', u'Œ': 'OE', u'œ': 'oe', u'Ŧ': 'T', + u'ŧ': 't', u'DŽ': 'DZ', u'Dž': 'Dz', u'LJ': 'LJ', u'Lj': 'Lj', + u'lj': 'lj', u'NJ': 'NJ', u'Nj': 'Nj', u'nj': 'nj', + u'Ǥ': 'G', u'ǥ': 'g', u'DZ': 'DZ', u'Dz': 'Dz', u'dz': 'dz', + u'Ȥ': 'Z', u'ȥ': 'z', u'№': 'No.', + u'º': 'o.', # normalize Nº abbrev (popular w/ classical music), + # this is 'masculine ordering indicator', not degree +} + +_XLATE_SPECIAL = { + # Translation table. + # Cover additional special characters processing normalization. 
+ u"'": '', # replace apostrophe with nothing + u'&': ' and ', # expand & to ' and ' +} + + +def _translate(s, dictionary): + # type: (basestring,Mapping[basestring,basestring])->basestring + return ''.join(dictionary.get(x, x) for x in s) + + +_COMBINING_RANGES = ( + (0x0300, 0x036f), # Combining Diacritical Marks + (0x1ab0, 0x1aff), # Combining Diacritical Marks Extended + (0x20d0, 0x20ff), # Combining Diacritical Marks for Symbols + (0x1dc0, 0x1dff) # Combining Diacritical Marks Supplement +) + + +def _is_unicode_combining(u): + # type: (unicode)->bool + """ + Check if input unicode is combining diacritical mark. + """ + i = ord(u) + for r in _COMBINING_RANGES: + if r[0] <= i <= r[1]: + return True + return False + + +def _transliterate(u, xlate): + # type: (unicode,Mapping[basestring,basestring])->unicode + """ + Perform transliteration using the specified dictionary + """ + u = unicodedata.normalize('NFD', u) + u = u''.join([u'' if _is_unicode_combining(x) else x for x in u]) + u = _translate(u, xlate) + # at this point output is either unicode, or plain ascii + return unicode(u) + + +def clean_name(s): + # type: (basestring)->unicode + """Remove non-alphanumeric characters from the string, perform + normalization and substitution of some special characters; coalesce spaces. + :param s: string to clean up, possibly unicode one. + :return: cleaned-up version of input string. + """ + if not isinstance(s, unicode): + # ignore extended chars if someone was dumb enough to pass non-ascii + # narrow string here, use only unicode for meaningful texts + u = unicode(s, 'ascii', 'replace') + else: + u = s + # 1. don't bother doing normalization NFKC, rather transliterate + # using special translation table + u = _transliterate(u, _XLATE_GRAPHICAL_AND_DIACRITICAL) + # 2. normalize NFKC the result + u = unicodedata.normalize('NFKC', u) + # 3. translate specials + u = _translate(u, _XLATE_SPECIAL) + # 4. replace any non-alphanumeric character sequences by spaces + u = _CN_RE1.sub(u' ', u) + # 5. 
coalesce interleaved space/underscore sequences + u = _CN_RE2.sub(u' ', u) + # 6. trim + u = u.strip() + # 7. lowercase + return u def cleanTitle(title): diff --git a/headphones/helpers_test.py b/headphones/helpers_test.py new file mode 100644 index 00000000..2033aab4 --- /dev/null +++ b/headphones/helpers_test.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +from unittestcompat import TestCase +from headphones.helpers import clean_name + + +class HelpersTest(TestCase): + + def test_clean_name(self): + """helpers: check correctness of clean_name() function""" + cases = { + u' Weiße & rose ': 'Weisse and rose', + u'Multiple / spaces': 'Multiple spaces', + u'Kevin\'s m²': 'Kevins m2', + u'Symphonęy Nº9': 'Symphoney No.9', + u'ÆæßðÞIJij': u'AeaessdThIJıj', + u'Obsessió (Cerebral Apoplexy remix)': 'obsessio cerebral ' + 'apoplexy remix', + u'Doktór Hałabała i siedmiu zbojów': 'doktor halabala i siedmiu ' + 'zbojow', + u'Arbetets Söner och Döttrar': 'arbetets soner och dottrar', + u'Björk Guðmundsdóttir': 'bjork gudmundsdottir', + u'L\'Arc~en~Ciel': 'larc en ciel', + u'Orquesta de la Luz (オルケスタ・デ・ラ・ルス)': + u'Orquesta de la Luz オルケスタ デ ラ ルス' + + } + for first, second in cases.iteritems(): + nf = clean_name(first).lower() + ns = clean_name(second).lower() + self.assertEqual( + nf, ns, u"check cleaning of case (%s," + u"%s)" % (nf, ns) + ) + + def test_clean_name_nonunicode(self): + """helpers: check if clean_name() works on non-unicode input""" + input = 'foo $ bar/BAZ' + test = clean_name(input).lower() + expected = 'foo bar baz' + self.assertEqual( + test, expected, "check clean_name() works on non-unicode" + ) + input = 'fóó $ BAZ' + test = clean_name(input).lower() + expected = clean_name('%fóó baz ').lower() + self.assertEqual( + test, expected, "check clean_name() with narrow non-ascii input" + ) diff --git a/headphones/importer.py b/headphones/importer.py index 3cc7f141..9a02ce49 100644 --- a/headphones/importer.py +++ b/headphones/importer.py @@ -374,7 +374,7 @@ def 
addArtisttoDB(artistid, extrasonly=False, forcefull=False, type="artist"): for track in hybridrelease['Tracks']: - cleanname = helpers.cleanName( + cleanname = helpers.clean_name( artist['artist_name'] + ' ' + rg['title'] + ' ' + track['title']) controlValueDict = {"TrackID": track['id'], @@ -710,7 +710,7 @@ def addReleaseById(rid, rgid=None): myDB.action('INSERT INTO releases VALUES( ?, ?)', [rid, release_dict['rgid']]) for track in release_dict['tracks']: - cleanname = helpers.cleanName( + cleanname = helpers.clean_name( release_dict['artist_name'] + ' ' + release_dict['rg_title'] + ' ' + track['title']) controlValueDict = {"TrackID": track['id'], diff --git a/headphones/librarysync.py b/headphones/librarysync.py index 367d160c..e4e7a53f 100644 --- a/headphones/librarysync.py +++ b/headphones/librarysync.py @@ -138,7 +138,7 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None, # TODO: skip adding songs without the minimum requisite information (just a matter of putting together the right if statements) if f_artist and f.album and f.title: - CleanName = helpers.cleanName(f_artist + ' ' + f.album + ' ' + f.title) + CleanName = helpers.clean_name(f_artist + ' ' + f.album + ' ' + f.title) else: CleanName = None @@ -332,15 +332,15 @@ def libraryScan(dir=None, append=False, ArtistID=None, ArtistName=None, # There was a bug where artists with special characters (-,') would show up in new artists. 
artist_list = [ x for x in unique_artists - if helpers.cleanName(x).lower() not in [ - helpers.cleanName(y[0]).lower() + if helpers.clean_name(x).lower() not in [ + helpers.clean_name(y[0]).lower() for y in current_artists ] ] artists_checked = [ x for x in unique_artists - if helpers.cleanName(x).lower() in [ - helpers.cleanName(y[0]).lower() + if helpers.clean_name(x).lower() in [ + helpers.clean_name(y[0]).lower() for y in current_artists ] ] diff --git a/headphones/mb.py b/headphones/mb.py index 38b952e6..619908ba 100644 --- a/headphones/mb.py +++ b/headphones/mb.py @@ -637,7 +637,7 @@ def get_new_releases(rgid, includeExtras=False, forcefull=False): for track in release['Tracks']: - cleanname = helpers.cleanName( + cleanname = helpers.clean_name( release['ArtistName'] + ' ' + release['AlbumTitle'] + ' ' + track['title']) controlValueDict = {"TrackID": track['id'], diff --git a/headphones/webserve.py b/headphones/webserve.py index 741a37f4..fc7b2d72 100644 --- a/headphones/webserve.py +++ b/headphones/webserve.py @@ -29,7 +29,7 @@ import urllib2 import os import re from headphones import logger, searcher, db, importer, mb, lastfm, librarysync, helpers, notifiers -from headphones.helpers import checked, radio, today, cleanName +from headphones.helpers import checked, radio, today, clean_name from mako.lookup import TemplateLookup from mako import exceptions import headphones @@ -577,7 +577,7 @@ class WebInterface(object): for albums in have_albums: # Have to skip over manually matched tracks if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']: - original_clean = helpers.cleanName( + original_clean = helpers.clean_name( albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle']) # else: # original_clean = None @@ -595,10 +595,12 @@ class WebInterface(object): # unmatchedalbums = [f for f in have_album_dictionary if f not in [x for x in headphones_album_dictionary]] check = set( - [(cleanName(d['ArtistName']).lower(), 
cleanName(d['AlbumTitle']).lower()) for d in + [(clean_name(d['ArtistName']).lower(), + clean_name(d['AlbumTitle']).lower()) for d in headphones_album_dictionary]) unmatchedalbums = [d for d in have_album_dictionary if ( - cleanName(d['ArtistName']).lower(), cleanName(d['AlbumTitle']).lower()) not in check] + clean_name(d['ArtistName']).lower(), + clean_name(d['AlbumTitle']).lower()) not in check] return serve_template(templatename="manageunmatched.html", title="Manage Unmatched Items", unmatchedalbums=unmatchedalbums) @@ -622,8 +624,8 @@ class WebInterface(object): (artist, album)) elif action == "matchArtist": - existing_artist_clean = helpers.cleanName(existing_artist).lower() - new_artist_clean = helpers.cleanName(new_artist).lower() + existing_artist_clean = helpers.clean_name(existing_artist).lower() + new_artist_clean = helpers.clean_name(new_artist).lower() if new_artist_clean != existing_artist_clean: have_tracks = myDB.action( 'SELECT Matched, CleanName, Location, BitRate, Format FROM have WHERE ArtistName=?', @@ -668,10 +670,10 @@ class WebInterface(object): "Artist %s already named appropriately; nothing to modify" % existing_artist) elif action == "matchAlbum": - existing_artist_clean = helpers.cleanName(existing_artist).lower() - new_artist_clean = helpers.cleanName(new_artist).lower() - existing_album_clean = helpers.cleanName(existing_album).lower() - new_album_clean = helpers.cleanName(new_album).lower() + existing_artist_clean = helpers.clean_name(existing_artist).lower() + new_artist_clean = helpers.clean_name(new_artist).lower() + existing_album_clean = helpers.clean_name(existing_album).lower() + new_album_clean = helpers.clean_name(new_album).lower() existing_clean_string = existing_artist_clean + " " + existing_album_clean new_clean_string = new_artist_clean + " " + new_album_clean if existing_clean_string != new_clean_string: @@ -728,7 +730,7 @@ class WebInterface(object): 'SELECT ArtistName, AlbumTitle, TrackTitle, CleanName, Matched from 
have') for albums in manualalbums: if albums['ArtistName'] and albums['AlbumTitle'] and albums['TrackTitle']: - original_clean = helpers.cleanName( + original_clean = helpers.clean_name( albums['ArtistName'] + " " + albums['AlbumTitle'] + " " + albums['TrackTitle']) if albums['Matched'] == "Ignored" or albums['Matched'] == "Manual" or albums[ 'CleanName'] != original_clean: @@ -769,7 +771,7 @@ class WebInterface(object): [artist]) update_count = 0 for tracks in update_clean: - original_clean = helpers.cleanName( + original_clean = helpers.clean_name( tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[ 'TrackTitle']).lower() album = tracks['AlbumTitle'] @@ -797,7 +799,7 @@ class WebInterface(object): (artist, album)) update_count = 0 for tracks in update_clean: - original_clean = helpers.cleanName( + original_clean = helpers.clean_name( tracks['ArtistName'] + " " + tracks['AlbumTitle'] + " " + tracks[ 'TrackTitle']).lower() track_title = tracks['TrackTitle']