headphones/headphones/lyrics.py

import re
import urllib
from xml.dom import minidom
import htmlentitydefs

from headphones import logger

def getLyrics(artist, song):

	params = {	"artist": artist.encode('utf-8'),
				"song": song.encode('utf-8'),
				"fmt": 'xml'
                }

	searchURL = 'http://lyrics.wikia.com/api.php?' + urllib.urlencode(params)

	try:
		data = urllib.urlopen(searchURL).read()
	except Exception, e:
		logger.warn('Error opening: %s. Error: %s' % (searchURL, e))
		return

	try:
		parseddata = minidom.parseString(data)
	except Exception, e:
		logger.warn('Error parsing data from url: %s. Error: %s' % (searchURL, e))
		return

	url = parseddata.getElementsByTagName("url")

	if url:
		lyricsurl = url[0].firstChild.nodeValue
	else:
		logger.info('No lyrics found for %s - %s' % (artist, song))
		return

	try:
		lyricspage = urllib.urlopen(lyricsurl).read()
	except Exception, e:
		logger.warn('Error fetching lyrics from: %s. Error: %s' % (lyricsurl, e))
		return

	m = re.compile('''<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--''').search(lyricspage)

	if not m:
		m = re.compile('''<div class='lyricbox'><span style="padding:1em"><a href="/Category:Instrumental" title="Instrumental">''').search(lyricspage)
		if m:
			return u'(Instrumental)'
		else:
			logger.warn('Cannot find lyrics on: %s' % lyricsurl)
			return

	lyrics = convert_html_entities(m.group(1)).replace('<br />', '\n')
	lyrics = re.sub('<.*?>', '', lyrics)

	return lyrics

def convert_html_entities(s):
    matches = re.findall("&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
                name = hit[2:-1]
                try:
                        entnum = int(name)
                        s = s.replace(hit, unichr(entnum))
                except ValueError:
                        pass

    matches = re.findall("&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if htmlentitydefs.name2codepoint.has_key(name):
                s = s.replace(hit, unichr(htmlentitydefs.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s