rutracker revision

- Now uses requests with more logging - Update to latest BeautifulSoup and html5lib libs
2026-07-08 10:03:59 +01:00 · 2015-08-02 12:18:25 +12:00
parent d90a31afc7
commit d2782179aa
28 changed files with 1268 additions and 933 deletions
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+
+import urllib
+import requests as requests
+from urlparse import urlparse
+from bs4 import BeautifulSoup
+
+import os
+import re
+
+import headphones
+from headphones import logger
+
+class Rutracker(object):
+
+    def __init__(self):
+        self.session = requests.session()
+        self.timeout = 60
+        self.loggedin = False
+        self.maxsize = 0
+        self.search_referer = 'http://rutracker.org/forum/tracker.php'
+
+    def logged_in(self):
+        return self.loggedin
+
+    def still_logged_in(self, html):
+        if not html or "action=\"http://login.rutracker.org/forum/login.php\">" in html:
+            return False
+        else:
+            return True
+
+    def login(self):
+        """
+        Logs in user
+        """
+
+        loginpage = 'http://login.rutracker.org/forum/login.php'
+        post_params = {
+            'login_username': headphones.CONFIG.RUTRACKER_USER,
+            'login_password': headphones.CONFIG.RUTRACKER_PASSWORD,
+            'login': b'\xc2\xf5\xee\xe4'  # '%C2%F5%EE%E4'
+        }
+
+        logger.info("Attempting to log in to rutracker...")
+
+        # User agent doesn't seem to matter?
+        #self.headers['User-Agent'] = self.useragents[random.randrange(0, len(self.useragents))]
+        try:
+            r = self.session.post(loginpage, data=post_params, timeout=self.timeout)
+            if r.status_code != 200:
+                logger.error("rutracker login returned status code %s" % r.status_code)
+                self.loggedin = False
+            else:
+                if 'bb_data' in r.cookies.keys():
+                    self.loggedin = True
+                    logger.info("Successfully logged in to rutracker")
+                else:
+                    logger.error("Could not login to rutracker, credentials maybe incorrect, " /
+                                 "site is down or too many attempts")
+                    self.loggedin = False
+            return self.loggedin
+        except Exception as e:
+            logger.error("Unknown error logging in to rutracker: %s" % e)
+            self.loggedin = False
+            return self.loggedin
+
+    def searchurl(self, artist, album, year, format):
+        """
+        Return the search url
+        """
+
+        # Build search url
+        searchterm = ''
+        if artist != 'Various Artists':
+            searchterm = artist
+            searchterm = searchterm + ' '
+        searchterm = searchterm + album
+        searchterm = searchterm + ' '
+        searchterm = searchterm + year
+
+        if format == 'lossless':
+            format = '+lossless'
+            self.maxsize = 10000000000
+        elif format == 'lossless+mp3':
+            format = '+lossless||mp3||aac'
+            self.maxsize = 10000000000
+        else:
+            format = '+mp3||aac'
+            self.maxsize = 300000000
+
+        # sort by size, descending.
+        sort = '&o=7&s=2'
+
+        searchurl = "%s?nm=%s%s%s" % (self.search_referer, urllib.quote(searchterm), format, sort)
+
+        logger.info("Searching rutracker using term: %s", searchterm)
+
+        return searchurl
+
+    def search(self, searchurl):
+        """
+        Parse the search results and return valid torrent list
+        """
+
+        try:
+            headers = {'Referer': self.search_referer}
+            r = self.session.get(url=searchurl, headers=headers, timeout=self.timeout)
+
+            soup = BeautifulSoup(r.content, 'html5lib')
+
+            # Debug
+            #logger.debug (soup.prettify())
+
+            # Check if still logged in
+            if not self.still_logged_in(soup):
+                self.login()
+                r = self.session.get(url=searchurl, timeout=self.timeout)
+                soup = BeautifulSoup(r.content, 'html5lib')
+                if not self.still_logged_in(soup):
+                    logger.error("Error getting rutracker data")
+                    return None
+
+            # Process
+            rulist = []
+            i = soup.find('table', id='tor-tbl')
+            if not i:
+                logger.info("No valid results found from rutracker")
+                return None
+            minimumseeders = int(headphones.CONFIG.NUMBEROFSEEDERS) - 1
+
+            for item in zip(i.find_all(class_='hl-tags'),i.find_all(class_='dl-stub'),i.find_all(class_='seedmed')):
+                title = item[0].get_text()
+                url = item[1].get('href')
+                size_formatted = item[1].get_text()[:-2]
+                seeds = item[2].get_text()
+                size_parts = size_formatted.split()
+                size = float(size_parts[0])
+
+                if size_parts[1] == 'KB':
+                    size *= 1024
+                if size_parts[1] == 'MB':
+                    size *= 1024 ** 2
+                if size_parts[1] == 'GB':
+                    size *= 1024 ** 3
+                if size_parts[1] == 'TB':
+                    size *= 1024 ** 4
+
+                if size < self.maxsize and minimumseeders < int(seeds):
+                    logger.info('Found %s. Size: %s' % (title, size_formatted))
+                    #Torrent topic page
+                    torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
+                    topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id
+                    rulist.append((title, size, topicurl, 'rutracker.org', 'torrent', True))
+                else:
+                    logger.info("%s is larger than the maxsize or has too little seeders for this category, " \
+                                "skipping. (Size: %i bytes, Seeders: %i)" % (title, size, int(seeds)))
+
+            if not rulist:
+                logger.info("No valid results found from rutracker")
+
+            return rulist
+
+        except Exception as e:
+            logger.error("An unknown error occurred in the rutracker parser: %s" % e)
+            return None
+
+
+    def get_torrent_data(self, url):
+        """
+        return the .torrent data
+        """
+
+        torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
+        downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
+        cookie = {'bb_dl': torrent_id}
+        try:
+            headers = {'Referer': url}
+            r = self.session.get(url=downloadurl, cookies=cookie, headers=headers, timeout=self.timeout)
+            return r.content
+        except Exception as e:
+            logger.error('Error getting torrent: %s', e)
+            return False
+
+
+    #TODO get this working in utorrent.py
+    def utorrent_add_file(self, data):
+
+        host = headphones.CONFIG.UTORRENT_HOST
+        if not host.startswith('http'):
+            host = 'http://' + host
+        if host.endswith('/'):
+            host = host[:-1]
+        if host.endswith('/gui'):
+            host = host[:-4]
+
+        base_url = host
+
+        url = base_url + '/gui/'
+        self.session.auth = (headphones.CONFIG.UTORRENT_USERNAME, headphones.CONFIG.UTORRENT_PASSWORD)
+
+        try:
+            r = self.session.get(url + 'token.html')
+        except Exception as e:
+            logger.error('Error getting token: %s', e)
+            return
+
+        if r.status_code == 401:
+            logger.debug('Error reaching utorrent')
+            return
+
+        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
+        if regex is None:
+            logger.debug('Error reading token')
+            return
+
+        self.session.params = {'token': regex.group(1)}
+        files = {'torrent_file': ("", data)}
+
+        try:
+            self.session.post(url, params={'action': 'add-file'}, files=files)
+        except Exception as e:
+            logger.exception('Error adding file to utorrent %s', e)
+
@@ -36,12 +36,10 @@ import unicodedata

 from headphones.common import USER_AGENT
 from headphones import logger, db, helpers, classes, sab, nzbget, request
-from headphones import utorrent, transmission, notifiers
+from headphones import utorrent, transmission, notifiers, rutracker

 from bencode import bencode, bdecode

-import headphones.searcher_rutracker as rutrackersearch
-
 # Magnet to torrent services, for Black hole. Stolen from CouchPotato.
 TORRENT_TO_MAGNET_SERVICES = [
    'https://zoink.it/torrent/%s.torrent',
@@ -51,9 +49,7 @@ TORRENT_TO_MAGNET_SERVICES = [

 # Persistent What.cd API object
 gazelle = None
-
-# RUtracker search object
-rutracker = rutrackersearch.Rutracker()
+ruobj = None


 def fix_url(s, charset="utf-8"):
@@ -818,15 +814,9 @@ def send_to_downloader(data, bestqual, album):
                        "to open or convert magnet links")
                    return
            else:
-                if bestqual[3] == "rutracker.org":
-                    download_path, _ = rutracker.get_torrent(bestqual[2],
-                        headphones.CONFIG.TORRENTBLACKHOLE_DIR)

-                    if not download_path:
-                        return
-                else:
-                    if not torrent_to_file(download_path, data):
-                        return
+                if not torrent_to_file(download_path, data):
+                    return

                # Extract folder name from torrent
                folder_name = read_torrent_name(download_path, bestqual[0])
@@ -836,13 +826,11 @@ def send_to_downloader(data, bestqual, album):
        elif headphones.CONFIG.TORRENT_DOWNLOADER == 1:
            logger.info("Sending torrent to Transmission")

-            # rutracker needs cookies to be set, pass the .torrent file instead of url
+            # Add torrent
            if bestqual[3] == 'rutracker.org':
-                file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
+                torrentid = transmission.addTorrent('', data)
            else:
-                file_or_url = bestqual[2]
-
-            torrentid = transmission.addTorrent(file_or_url)
+                torrentid = transmission.addTorrent(bestqual[2])

            if not torrentid:
                logger.error("Error sending torrent to Transmission. Are you sure it's running?")
@@ -855,13 +843,6 @@ def send_to_downloader(data, bestqual, album):
                logger.error('Torrent folder name could not be determined')
                return

-            # remove temp .torrent file created above
-            if bestqual[3] == 'rutracker.org':
-                try:
-                    shutil.rmtree(os.path.split(file_or_url)[0])
-                except Exception as e:
-                    logger.exception("Unhandled exception")
-
            # Set Seed Ratio
            seed_ratio = get_seed_ratio(bestqual[3])
            if seed_ratio is not None:
@@ -870,30 +851,30 @@ def send_to_downloader(data, bestqual, album):
        else:# if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
            logger.info("Sending torrent to uTorrent")

-            # rutracker needs cookies to be set, pass the .torrent file instead of url
+            # Add torrent
            if bestqual[3] == 'rutracker.org':
-                file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
-                folder_name, cacheid = utorrent.dirTorrent(torrentid)
-                folder_name = os.path.basename(os.path.normpath(folder_name))
-                utorrent.labelTorrent(torrentid)
+                ruobj.utorrent_add_file(data)
            else:
-                file_or_url = bestqual[2]
-                torrentid = calculate_torrent_hash(file_or_url, data)
-                folder_name = utorrent.addTorrent(file_or_url, torrentid)
+                utorrent.addTorrent(bestqual[2])

+            # Get hash
+            torrentid = calculate_torrent_hash(bestqual[2], data)
+            if not torrentid:
+                logger.error('Torrent id could not be determined')
+                return
+
+            # Set Label
+            if headphones.CONFIG.UTORRENT_LABEL:
+                utorrent.labelTorrent(torrentid)
+
+            # Get folder
+            folder_name = utorrent.getFolder(torrentid)
            if folder_name:
                logger.info('Torrent folder name: %s' % folder_name)
            else:
                logger.error('Torrent folder name could not be determined')
                return

-            # remove temp .torrent file created above
-            if bestqual[3] == 'rutracker.org':
-                try:
-                    shutil.rmtree(os.path.split(file_or_url)[0])
-                except Exception as e:
-                    logger.exception("Unhandled exception")
-
            # Set Seed Ratio
            seed_ratio = get_seed_ratio(bestqual[3])
            if seed_ratio is not None:
@@ -1041,12 +1022,7 @@ def verifyresult(title, artistterm, term, lossless):

 def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose_specific_download=False):
    global gazelle  # persistent what.cd api object to reduce number of login attempts
-
-    # rutracker login
-    if headphones.CONFIG.RUTRACKER and album:
-        rulogin = rutracker.login(headphones.CONFIG.RUTRACKER_USER, headphones.CONFIG.RUTRACKER_PASSWORD)
-        if not rulogin:
-            logger.info(u'Could not login to rutracker, search results will exclude this provider')
+    global ruobj    # and rutracker

    albumid = album['AlbumID']
    reldate = album['ReleaseDate']
@@ -1239,45 +1215,38 @@ def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose
                        logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s", e)

    # rutracker.org
-    if headphones.CONFIG.RUTRACKER and rulogin:
+    if headphones.CONFIG.RUTRACKER:
        provider = "rutracker.org"

        # Ignore if release date not specified, results too unpredictable
        if not year and not usersearchterm:
-            logger.info(u'Release date not specified, ignoring for rutracker.org')
+            logger.info(u"Release date not specified, ignoring for rutracker.org")
        else:
-
            if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly:
                format = 'lossless'
-                maxsize = 10000000000
            elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless:
                format = 'lossless+mp3'
-                maxsize = 10000000000
            else:
                format = 'mp3'
-                maxsize = 300000000

-            # build search url based on above
-            if not usersearchterm:
-                searchURL = rutracker.searchurl(artistterm, albumterm, year, format)
-            else:
-                searchURL = rutracker.searchurl(usersearchterm, ' ', ' ', format)
+            # Login
+            if not ruobj or not ruobj.logged_in():
+                ruobj = rutracker.Rutracker()
+                if not ruobj.login():
+                    ruobj = None

-            logger.info(u'Parsing results from <a href="%s">rutracker.org</a>' % searchURL)
+            if ruobj and ruobj.logged_in():

-            # parse results and get best match
-            rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid)
+                # build search url
+                if not usersearchterm:
+                    searchURL = ruobj.searchurl(artistterm, albumterm, year, format)
+                else:
+                    searchURL = ruobj.searchurl(usersearchterm, ' ', ' ', format)

-            # add best match to overall results list
-            if rulist:
-                for ru in rulist:
-                    title = ru[0].decode('utf-8')
-                    size = ru[1]
-                    url = ru[2]
-                    resultlist.append((title, size, url, provider, 'torrent', True))
-                    logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
-            else:
-                logger.info(u"No valid results found from %s" % (provider))
+                # parse results
+                rulist = ruobj.search(searchURL)
+                if rulist:
+                    resultlist.extend(rulist)

    if headphones.CONFIG.WHATCD:
        provider = "What.cd"
@@ -1567,12 +1536,14 @@ def preprocess(resultlist):

    for result in resultlist:
        if result[4] == 'torrent':
+
+            # rutracker always needs the torrent data
+            if result[3] == 'rutracker.org':
+                return ruobj.get_torrent_data(result[2]), result
+
            #Get out of here if we're using Transmission
            if headphones.CONFIG.TORRENT_DOWNLOADER == 1:  ## if not a magnet link still need the .torrent to generate hash... uTorrent support labeling
                return True, result
-            # get outta here if rutracker
-            if result[3] == 'rutracker.org':
-                return True, result
            # Get out of here if it's a magnet link
            if result[2].lower().startswith("magnet:"):
                return True, result
@@ -1,349 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-# Headphones rutracker.org search
-# Functions called from searcher.py
-
-from bencode import bencode as bencode, bdecode
-from urlparse import urlparse
-from bs4 import BeautifulSoup
-from tempfile import mkdtemp
-from hashlib import sha1
-
-import headphones
-import requests
-import cookielib
-import urllib2
-import urllib
-import re
-import os
-
-from headphones import db, logger
-
-
-class Rutracker():
-
-    logged_in = False
-
-    # Stores a number of login attempts to prevent recursion.
-    #login_counter = 0
-
-    def __init__(self):
-
-        self.cookiejar = cookielib.CookieJar()
-        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar))
-        urllib2.install_opener(self.opener)
-
-    def login(self, login, password):
-        """Implements tracker login procedure."""
-
-        self.logged_in = False
-
-        if login is None or password is None:
-            return False
-
-        #self.login_counter += 1
-
-        # No recursion wanted.
-        #if self.login_counter > 1:
-        #    return False
-
-        params = urllib.urlencode({"login_username": login,
-                                   "login_password": password,
-                                   "login": "Вход"})
-
-        try:
-            self.opener.open("http://login.rutracker.org/forum/login.php", params)
-        except Exception:
-            pass
-
-        # Check if we're logged in
-        for cookie in self.cookiejar:
-            if cookie.name == 'bb_data':
-                self.logged_in = True
-
-        return self.logged_in
-
-    def searchurl(self, artist, album, year, format):
-        """
-        Return the search url
-        """
-
-        # Build search url
-        searchterm = ''
-        if artist != 'Various Artists':
-            searchterm = artist
-            searchterm = searchterm + ' '
-        searchterm = searchterm + album
-        searchterm = searchterm + ' '
-        searchterm = searchterm + year
-
-        providerurl = "http://rutracker.org/forum/tracker.php"
-
-        if format == 'lossless':
-            format = '+lossless'
-        elif format == 'lossless+mp3':
-            format = '+lossless||mp3||aac'
-        else:
-            format = '+mp3||aac'
-
-        # sort by size, descending.
-        sort = '&o=7&s=2'
-
-        searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort)
-
-        return searchurl
-
-    def search(self, searchurl, maxsize, minseeders, albumid):
-        """
-        Parse the search results and return valid torrent list
-        """
-
-        titles = []
-        urls = []
-        seeders = []
-        sizes = []
-        torrentlist = []
-        rulist = []
-
-        try:
-
-            page = self.opener.open(searchurl, timeout=60)
-            soup = BeautifulSoup(page.read())
-
-            # Debug
-            #logger.debug (soup.prettify())
-
-            # Title
-            for link in soup.find_all('a', attrs={'class': 'med tLink hl-tags bold'}):
-                title = link.get_text()
-                titles.append(title)
-
-            # Download URL
-            for link in soup.find_all('a', attrs={'class': 'small tr-dl dl-stub'}):
-                url = link.get('href')
-                urls.append(url)
-
-            # Seeders
-            for link in soup.find_all('b', attrs={'class': 'seedmed'}):
-                seeder = link.get_text()
-                seeders.append(seeder)
-
-            # Size
-            for link in soup.find_all('td', attrs={'class': 'row4 small nowrap tor-size'}):
-                size = link.u.string
-                sizes.append(size)
-
-        except:
-            pass
-
-        # Combine lists
-        torrentlist = zip(titles, urls, seeders, sizes)
-
-        # return if nothing found
-        if not torrentlist:
-            return False
-
-        # don't bother checking track counts anymore, let searcher filter instead
-        # leave code in just in case
-        check_track_count = False
-
-        if check_track_count:
-
-            # get headphones track count for album, return if not found
-            myDB = db.DBConnection()
-            tracks = myDB.select('SELECT * from tracks WHERE AlbumID=?', [albumid])
-            hptrackcount = len(tracks)
-
-            if not hptrackcount:
-                logger.info('headphones track info not found, cannot compare to torrent')
-                return False
-
-            # Return all valid entries, ignored, required words now checked in searcher.py
-
-            #unwantedlist = ['promo', 'vinyl', '[lp]', 'songbook', 'tvrip', 'hdtv', 'dvd']
-
-            formatlist = ['ape', 'flac', 'ogg', 'm4a', 'aac', 'mp3', 'wav', 'aif']
-            deluxelist = ['deluxe', 'edition', 'japanese', 'exclusive']
-
-        for torrent in torrentlist:
-
-            returntitle = torrent[0].encode('utf-8')
-            url = torrent[1]
-            seeders = torrent[2]
-            size = torrent[3]
-
-            if int(size) <= maxsize and int(seeders) >= minseeders:
-
-                #Torrent topic page
-                torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
-                topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id
-
-                # add to list
-                if not check_track_count:
-                    valid = True
-                else:
-
-                    # Check torrent info
-                    self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
-
-                    # Debug
-                    #for cookie in self.cookiejar:
-                    #    logger.debug ('Cookie: %s' % cookie)
-
-                    try:
-                        page = self.opener.open(url)
-                        torrent = page.read()
-                        if torrent:
-                            decoded = bdecode(torrent)
-                            metainfo = decoded['info']
-                        page.close()
-                    except Exception as e:
-                        logger.error('Error getting torrent: %s' % e)
-                        return False
-
-                    # get torrent track count and check for cue
-                    trackcount = 0
-                    cuecount = 0
-
-                    if 'files' in metainfo: # multi
-                        for pathfile in metainfo['files']:
-                            path = pathfile['path']
-                            for file in path:
-                                if any(file.lower().endswith('.' + x.lower()) for x in formatlist):
-                                    trackcount += 1
-                                if '.cue' in file:
-                                    cuecount += 1
-
-                    title = returntitle.lower()
-                    logger.debug('torrent title: %s' % title)
-                    logger.debug('headphones trackcount: %s' % hptrackcount)
-                    logger.debug('rutracker trackcount: %s' % trackcount)
-
-                    # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s)
-                    # This is for the case where we have a single .flac/.wav which can be split by cue
-                    # Not great, but shouldn't be doing this too often
-                    totallogcount = 0
-                    if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount:
-                        page = self.opener.open(topicurl, timeout=60)
-                        soup = BeautifulSoup(page.read())
-                        findtoc = soup.find_all(text='TOC of the extracted CD')
-                        if not findtoc:
-                            findtoc = soup.find_all(text='TOC извлечённого CD')
-                        for toc in findtoc:
-                            logcount = 0
-                            for toccontent in toc.find_all_next(text=True):
-                                cut_string = toccontent.split('|')
-                                new_string = cut_string[0].lstrip().rstrip()
-                                if new_string == '1' or new_string == '01':
-                                    logcount = 1
-                                elif logcount > 0:
-                                    if new_string.isdigit():
-                                        logcount += 1
-                                    else:
-                                        break
-                            totallogcount = totallogcount + logcount
-
-                    if totallogcount > 0:
-                        trackcount = totallogcount
-                        logger.debug('rutracker logtrackcount: %s' % totallogcount)
-
-                    # If torrent track count = hp track count then return torrent,
-                    # if greater, check for deluxe/special/foreign editions
-                    # if less, then allow if it's a single track with a cue
-                    valid = False
-
-                    if trackcount == hptrackcount:
-                        valid = True
-                    elif trackcount > hptrackcount:
-                        if any(deluxe in title for deluxe in deluxelist):
-                            valid = True
-
-                # Add to list
-                if valid:
-                    rulist.append((returntitle, size, topicurl))
-                else:
-                    if topicurl:
-                        logger.info(u'<a href="%s">Torrent</a> found with %s tracks but the selected headphones release has %s tracks, skipping for rutracker.org' % (topicurl, trackcount, hptrackcount))
-            else:
-                logger.info('%s is larger than the maxsize or has too little seeders for this category, skipping. (Size: %i bytes, Seeders: %i)' % (returntitle, int(size), int(seeders)))
-
-        return rulist
-
-    def get_torrent(self, url, savelocation=None):
-
-        torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
-        self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
-        downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
-        torrent_name = torrent_id + '.torrent'
-
-        try:
-            prev = os.umask(headphones.UMASK)
-            page = self.opener.open(downloadurl)
-            torrent = page.read()
-            decoded = bdecode(torrent)
-            metainfo = decoded['info']
-            tor_hash = sha1(bencode(metainfo)).hexdigest()
-            if savelocation:
-                download_path = os.path.join(savelocation, torrent_name)
-            else:
-                tempdir = mkdtemp(suffix='_rutracker_torrents')
-                download_path = os.path.join(tempdir, torrent_name)
-
-            with open(download_path, 'wb') as f:
-                f.write(torrent)
-            os.umask(prev)
-
-            # Add file to utorrent
-            if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
-                self.utorrent_add_file(download_path)
-
-        except Exception as e:
-            logger.error('Error getting torrent: %s', e)
-            return False
-
-        return download_path, tor_hash
-
-    #TODO get this working in utorrent.py
-    def utorrent_add_file(self, filename):
-
-        host = headphones.CONFIG.UTORRENT_HOST
-        if not host.startswith('http'):
-            host = 'http://' + host
-        if host.endswith('/'):
-            host = host[:-1]
-        if host.endswith('/gui'):
-            host = host[:-4]
-
-        base_url = host
-        username = headphones.CONFIG.UTORRENT_USERNAME
-        password = headphones.CONFIG.UTORRENT_PASSWORD
-
-        session = requests.Session()
-        url = base_url + '/gui/'
-        session.auth = (username, password)
-
-        try:
-            r = session.get(url + 'token.html')
-        except Exception:
-            logger.exception('Error getting token')
-            return
-
-        if r.status_code == '401':
-            logger.debug('Error reaching utorrent')
-            return
-
-        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
-        if regex is None:
-            logger.debug('Error reading token')
-            return
-
-        session.params = {'token': regex.group(1)}
-
-        with open(filename, 'rb') as f:
-            try:
-                session.post(url, params={'action': 'add-file'},
-                    files={'torrent_file': f})
-            except Exception:
-                logger.exception('Error adding file to utorrent')
-                return
@@ -28,12 +28,15 @@ import headphones
 #       Store torrent id so we can check up on it


-def addTorrent(link):
+def addTorrent(link, data=None):
    method = 'torrent-add'

-    if link.endswith('.torrent'):
-        with open(link, 'rb') as f:
-            metainfo = str(base64.b64encode(f.read()))
+    if link.endswith('.torrent') or data:
+        if data:
+            metainfo = str(base64.b64encode(data))
+        else:
+            with open(link, 'rb') as f:
+                metainfo = str(base64.b64encode(f.read()))
        arguments = {'metainfo': metainfo, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}
    else:
        arguments = {'filename': link, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}
@@ -220,7 +220,7 @@ def dirTorrent(hash, cacheid=None, return_name=None):
    cacheid = torrentList['torrentc']

    for torrent in torrents:
-        if torrent[0].upper() == hash:
+        if torrent[0].upper() == hash.upper():
            if not return_name:
                return torrent[26], cacheid
            else:
@@ -228,8 +228,12 @@ def dirTorrent(hash, cacheid=None, return_name=None):

    return None, None

+def addTorrent(link):
+    uTorrentClient = utorrentclient()
+    uTorrentClient.add_url(link)

-def addTorrent(link, hash):
+
+def getFolder(hash):
    uTorrentClient = utorrentclient()

    # Get Active Directory from settings
@@ -239,8 +243,6 @@ def addTorrent(link, hash):
        logger.error('Could not get "Put new downloads in:" directory from uTorrent settings, please ensure it is set')
        return None

-    uTorrentClient.add_url(link)
-
    # Get Torrent Folder Name
    torrent_folder, cacheid = dirTorrent(hash)

@@ -254,10 +256,8 @@ def addTorrent(link, hash):

    if torrent_folder == active_dir or not torrent_folder:
        torrent_folder, cacheid = dirTorrent(hash, cacheid, return_name=True)
-        labelTorrent(hash)
        return torrent_folder
    else:
-        labelTorrent(hash)
        if headphones.SYS_PLATFORM != "win32":
            torrent_folder = torrent_folder.replace('\\', '/')
        return os.path.basename(os.path.normpath(torrent_folder))
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.2"
-__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
+__version__ = "4.4.0"
+__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']
@@ -45,7 +45,7 @@ from .element import (

 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

 class BeautifulSoup(Tag):
    """
@@ -77,8 +77,11 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+
    def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
+                 parse_only=None, from_encoding=None, exclude_encodings=None,
+                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""
@@ -114,9 +117,9 @@ class BeautifulSoup(Tag):
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
+                "BeautifulSoup constructor. Suggest you use "
+                "features='lxml' for HTML and features='lxml-xml' for "
+                "XML.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
@@ -140,6 +143,7 @@ class BeautifulSoup(Tag):
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
+            original_features = features
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
@@ -151,6 +155,16 @@ class BeautifulSoup(Tag):
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
+            if not (original_features == builder.NAME or
+                    original_features in builder.ALTERNATE_NAMES):
+                if builder.is_xml:
+                    markup_type = "XML"
+                else:
+                    markup_type = "HTML"
+                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    parser=builder.NAME,
+                    markup_type=markup_type))
+
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self
@@ -178,6 +192,8 @@ class BeautifulSoup(Tag):
                # system. Just let it go.
                pass
            if is_file:
+                if isinstance(markup, unicode):
+                    markup = markup.encode("utf8")
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
@@ -185,12 +201,15 @@ class BeautifulSoup(Tag):
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                    or (isinstance(markup, unicode) and not u' ' in markup)):
+                    if isinstance(markup, unicode):
+                        markup = markup.encode("utf8")
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
-            self.builder.prepare_markup(markup, from_encoding)):
+             self.builder.prepare_markup(
+                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
@@ -203,6 +222,16 @@ class BeautifulSoup(Tag):
        self.markup = None
        self.builder.soup = None

+    def __copy__(self):
+        """Build a new object of this type from self.encode(), reusing
+        this object's builder."""
+        return type(self)(self.encode(), builder=self.builder)
+
+    def __getstate__(self):
+        """Return a copy of __dict__ to use as the pickled state.
+
+        The 'builder' entry is left out when the builder's `picklable`
+        flag is false.
+        """
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            del d['builder']
+        return d
+
    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()
@@ -229,9 +258,7 @@ class BeautifulSoup(Tag):

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
-        navigable = subclass(s)
-        navigable.setup()
-        return navigable
+        return subclass(s)

    def insert_before(self, successor):
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@@ -290,14 +317,49 @@ class BeautifulSoup(Tag):
    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree."""
        parent = parent or self.currentTag
-        most_recent_element = most_recent_element or self._most_recent_element
-        o.setup(parent, most_recent_element)
+        previous_element = most_recent_element or self._most_recent_element
+
+        # A Tag passes along the links it already has; any other object
+        # gets None for these three.
+        next_element = previous_sibling = next_sibling = None
+        if isinstance(o, Tag):
+            next_element = o.next_element
+            next_sibling = o.next_sibling
+            previous_sibling = o.previous_sibling
+            if not previous_element:
+                previous_element = o.previous_element
+
+        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

-        if most_recent_element is not None:
-            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

+        if parent.next_sibling:
+            # This node is being inserted into an element that has
+            # already been parsed. Deal with any dangling references.
+            index = parent.contents.index(o)
+            if index == 0:
+                # First child: the element just before it is the parent.
+                previous_element = parent
+                previous_sibling = None
+            else:
+                previous_element = previous_sibling = parent.contents[index-1]
+            if index == len(parent.contents)-1:
+                # Last child: what follows it is the parent's next sibling.
+                next_element = parent.next_sibling
+                next_sibling = None
+            else:
+                next_element = next_sibling = parent.contents[index+1]
+
+            # Overwrite o's four links and point each neighbour back at o.
+            o.previous_element = previous_element
+            if previous_element:
+                previous_element.next_element = o
+            o.next_element = next_element
+            if next_element:
+                next_element.previous_element = o
+            o.next_sibling = next_sibling
+            if next_sibling:
+                next_sibling.previous_sibling = o
+            o.previous_sibling = previous_sibling
+            if previous_sibling:
+                previous_sibling.next_sibling = o
+
    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

+    NAME = "[Unknown tree builder]"
+    ALTERNATE_NAMES = []
    features = []

    is_xml = False
+    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
@@ -2,6 +2,7 @@ __all__ = [
    'HTML5TreeBuilder',
    ]

+from pdb import set_trace
 import warnings
 from bs4.builder import (
    PERMISSIVE,
@@ -9,7 +10,10 @@ from bs4.builder import (
    HTML_5,
    HTMLTreeBuilder,
    )
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+    NamespacedAttribute,
+    whitespace_re,
+)
 import html5lib
 from html5lib.constants import namespaces
 from bs4.element import (
@@ -22,11 +26,20 @@ from bs4.element import (
 class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+    NAME = "html5lib"

-    def prepare_markup(self, markup, user_specified_encoding):
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
+        """Yield the single parse attempt for this builder.
+
+        `markup` is passed through untouched; the one yielded tuple is
+        (markup, None, None, False). `user_specified_encoding` is stored
+        on the builder. `document_declared_encoding` is ignored, and a
+        non-empty `exclude_encodings` only triggers a warning.
+        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            # Name the keyword exactly as callers spell it.
+            warnings.warn("You provided a value for exclude_encodings, but the html5lib tree builder doesn't support exclude_encodings.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
@@ -101,7 +114,13 @@ class AttrList(object):
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
-        "set attr", name, value
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            value = whitespace_re.split(value)
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
@@ -161,6 +180,12 @@ class Element(html5lib.treebuilders._base.Node):
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
+            elif self.element.next_element is not None:
+                # Something from further ahead in the parse tree is
+                # being inserted into this earlier element. This is
+                # very annoying because it means an expensive search
+                # for the last element in the tree.
+                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

@@ -172,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node):
        return AttrList(self.element)

    def setAttributes(self, attributes):
+
        if attributes is not None and len(attributes) > 0:

            converted_attributes = []
@@ -218,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node):

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
@@ -236,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node):
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
-        append_after = new_parent.element.contents
+        append_after = new_parent_element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
-            first_child.previous_element = new_parents_last_descendant
+            if new_parents_last_descendant:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child:
+                new_parents_last_child.next_sibling = first_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element:
+                new_parents_last_descendant_next_element.previous_element = last_child
            last_child.next_sibling = None

        for child in to_append:
@@ -257,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node):
        element.contents = []
        element.next_element = final_next_element

+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
+
    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
@@ -268,7 +312,7 @@ class Element(html5lib.treebuilders._base.Node):
        return self.element.contents

    def getNameTuple(self):
+        """Return (namespace, name), using namespaces["html"] when no
+        namespace is set on this element."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name
@@ -4,10 +4,16 @@ __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from HTMLParser import HTMLParser
+
+try:
+    from HTMLParser import HTMLParseError
+except ImportError, e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
+
 import sys
 import warnings

@@ -19,10 +25,10 @@ import warnings
 # At the end of this file, we monkeypatch HTMLParser so that
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
-    major > 3
-    or (major == 3 and minor > 2)
-    or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+

 from bs4.element import (
    CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed.
+        # it's fixed in all supported versions.
+        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_pi(self, data):
+        """Add a processing instruction: `data` is fed to the soup,
+        unmodified, between an endData() call and an
+        endData(ProcessingInstruction) call."""
        self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)

@@ -128,15 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
-    features = [HTML, STRICT, HTMLPARSER]
+    picklable = True
+    NAME = HTMLPARSER
+    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
+        """Save the arguments as self.parser_args, first adding
+        strict=False and/or convert_charrefs=False when the module-level
+        version flags call for them."""
-        if CONSTRUCTOR_TAKES_STRICT:
+        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
+        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
@@ -147,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
@@ -7,7 +7,12 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+)
 from bs4.builder import (
    FAST,
    HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    is_xml = True

+    NAME = "lxml-xml"
+    ALTERNATE_NAMES = ["xml"]
+
    # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
@@ -95,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.pop()

    def pi(self, target, data):
+        """Add a processing instruction: the text target + ' ' + data is
+        fed to the soup between an endData() call and an
+        endData(ProcessingInstruction) call."""
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(ProcessingInstruction)

    def data(self, content):
        self.soup.handle_data(content)
@@ -212,7 +224,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):

 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

-    features = [LXML, HTML, FAST, PERMISSIVE]
+    NAME = LXML
+    ALTERNATE_NAMES = ["lxml-html"]
+
+    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
@@ -3,10 +3,11 @@

 This library converts a bytestream to Unicode through any means
 necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """

+from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re
@@ -212,8 +213,11 @@ class EncodingDetector:

    5. Windows-1252.
    """
-    def __init__(self, markup, override_encodings=None, is_html=False):
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
        self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None
@@ -224,6 +228,8 @@ class EncodingDetector:
    def _usable(self, encoding, tried):
        if encoding is not None:
            encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
@@ -266,6 +272,9 @@ class EncodingDetector:
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies."""
        encoding = None
+        if isinstance(data, unicode):
+            # Unicode data cannot have a byte-order mark.
+            return data, encoding
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
@@ -299,14 +308,14 @@ class EncodingDetector:
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))
-
+            
        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii')
+                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
@@ -331,13 +340,14 @@ class UnicodeDammit:
        ]

    def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

-        self.detector = EncodingDetector(markup, override_encodings, is_html)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
@@ -33,12 +33,21 @@ def diagnose(data):

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
-        from lxml import etree
-        print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        try:
+            from lxml import etree
+            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        except ImportError, e:
+            print (
+                "lxml is not installed or couldn't be imported.")
+

    if 'html5lib' in basic_parsers:
-        import html5lib
-        print "Found html5lib version %s" % html5lib.__version__
+        try:
+            import html5lib
+            print "Found html5lib version %s" % html5lib.__version__
+        except ImportError, e:
+            print (
+                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
@@ -135,7 +144,7 @@ def rword(length=5):
 def rsentence(length=4):
    "Generate a random sentence-like string."
    return " ".join(rword(random.randint(4,9)) for i in range(length))
-
+        
 def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
@@ -159,7 +168,7 @@ def benchmark_parsers(num_elements=100000):
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)
-
+    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
@@ -1,3 +1,4 @@
+from pdb import set_trace
 import collections
 import re
 import sys
@@ -185,24 +186,40 @@ class PageElement(object):
            return self.HTML_FORMATTERS.get(
                name, HTMLAwareEntitySubstitution.substitute_xml)

-    def setup(self, parent=None, previous_element=None):
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
+
+        # Each link that is set below is mirrored on the neighbour, so
+        # both sides of the relation agree.
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
-        self.next_element = None
-        self.previous_sibling = None
-        self.next_sibling = None
-        if self.parent is not None and self.parent.contents:
-            self.previous_sibling = self.parent.contents[-1]
+
+        self.next_element = next_element
+        if self.next_element:
+            self.next_element.previous_element = self
+
+        self.next_sibling = next_sibling
+        if self.next_sibling:
+            self.next_sibling.previous_sibling = self
+
+        # With no previous sibling given, fall back to the parent's
+        # current last child, if it has one.
+        if (not previous_sibling
+            and self.parent is not None and self.parent.contents):
+            previous_sibling = self.parent.contents[-1]
+
+        self.previous_sibling = previous_sibling
+        if previous_sibling:
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
+        # Guard: an element with no parent has no place in a tree to hand
+        # over. The trailing space keeps the two literals from running
+        # together when they are concatenated.
+        if not self.parent:
+            raise ValueError(
+                "Cannot replace one element with another when the "
+                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
        if replace_with is self.parent:
@@ -216,6 +233,10 @@ class PageElement(object):

    def unwrap(self):
        my_parent = self.parent
+        # Guard: without a parent there is nowhere to put the contents.
+        # The trailing space keeps the two literals from running together
+        # when they are concatenated.
+        if not self.parent:
+            raise ValueError(
+                "Cannot replace an element with its contents when that "
+                "element is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract()
        for child in reversed(self.contents[:]):
@@ -240,17 +261,20 @@ class PageElement(object):
        last_child = self._last_descendant()
        next_element = last_child.next_element

-        if self.previous_element is not None:
+        if (self.previous_element is not None and
+            self.previous_element != next_element):
            self.previous_element.next_element = next_element
-        if next_element is not None:
+        if next_element is not None and next_element != self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
-        if self.previous_sibling is not None:
+        if (self.previous_sibling is not None
+            and self.previous_sibling != self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
-        if self.next_sibling is not None:
+        if (self.next_sibling is not None
+            and self.next_sibling != self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
@@ -478,6 +502,10 @@ class PageElement(object):
    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

+        if text is None and 'string' in kwargs:
+            text = kwargs['string']
+            del kwargs['string']
+
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
@@ -548,17 +576,17 @@ class PageElement(object):

    # Methods for supporting CSS selectors.

-    tag_name_re = re.compile('^[a-z0-9]+$')
+    tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

-    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
-    #   \---/  \---/\-------------/    \-------/
-    #     |      |         |               |
-    #     |      |         |           The value
-    #     |      |    ~,|,^,$,* or =
-    #     |   Attribute
+    # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
+    #   \---------------------------/  \---/\-------------/    \-------/
+    #     |                              |         |               |
+    #     |                              |         |           The value
+    #     |                              |    ~,|,^,$,* or =
+    #     |                           Attribute
    #    Tag
    attribselect_re = re.compile(
-        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
+        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

@@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement):
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
-            return unicode.__new__(cls, value)
-        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+            u = unicode.__new__(cls, value)
+        else:
+            u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+        u.setup()
+        return u

    def __copy__(self):
-        return self
+        """A copy of a NavigableString has the same contents and class
+        as the original, but it is not connected to the parse tree.
+        """
+        return type(self)(self)

    def __getnewargs__(self):
        return (unicode(self),)
@@ -707,7 +741,7 @@ class CData(PreformattedString):
 class ProcessingInstruction(PreformattedString):

    PREFIX = u'<?'
-    SUFFIX = u'?>'
+    SUFFIX = u'>'

 class Comment(PreformattedString):

@@ -759,9 +793,12 @@ class Tag(PageElement):
        self.prefix = prefix
        if attrs is None:
            attrs = {}
-        elif attrs and builder.cdata_list_attributes:
-            attrs = builder._replace_cdata_list_attribute_values(
-                self.name, attrs)
+        elif attrs:
+            if builder is not None and builder.cdata_list_attributes:
+                attrs = builder._replace_cdata_list_attribute_values(
+                    self.name, attrs)
+            else:
+                attrs = dict(attrs)
        else:
            attrs = dict(attrs)
        self.attrs = attrs
@@ -778,6 +815,18 @@ class Tag(PageElement):

    parserClass = _alias("parser_class")  # BS3

+    def __copy__(self):
+        """Return a new Tag that is not connected to any parse tree.
+
+        The clone is constructed from this tag's ``builder``, ``name``,
+        ``namespace``, ``prefix`` and ``attrs``; its contents are
+        copies of this tag's children.
+        """
+        # The constructor stores the namespace prefix as ``self.prefix``,
+        # so that is the attribute to hand to the clone.
+        clone = type(self)(None, self.builder, self.name, self.namespace,
+                           self.prefix, self.attrs)
+        # These flags are copied over from the original explicitly.
+        for attr in ('can_be_empty_element', 'hidden'):
+            setattr(clone, attr, getattr(self, attr))
+        # Copy each child and attach the copy to the clone.
+        for child in self.contents:
+            clone.append(child.__copy__())
+        return clone
+
    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)
@@ -971,15 +1020,25 @@ class Tag(PageElement):
        as defined in __eq__."""
        return not self == other

-    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+    def __repr__(self, encoding="unicode-escape"):
        """Renders this tag as a string."""
-        return self.encode(encoding)
+        if PY3K:
+            # "The return value must be a string object", i.e. Unicode
+            return self.decode()
+        else:
+            # "The return value must be a string object", i.e. a bytestring.
+            # By convention, the return value of __repr__ should also be
+            # an ASCII string.
+            return self.encode(encoding)

    def __unicode__(self):
        return self.decode()

    def __str__(self):
-        return self.encode()
+        if PY3K:
+            return self.decode()
+        else:
+            return self.encode()

    if PY3K:
        __str__ = __repr__ = __unicode__
@@ -1103,12 +1162,18 @@ class Tag(PageElement):
                       formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

+        :param indent_level: Each line of the rendering will be
+           indented this many spaces.
+
        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
+
+        :param formatter: The output formatter responsible for converting
+           entities to Unicode characters.
        """
        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
@@ -1137,7 +1202,17 @@ class Tag(PageElement):
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
-        """Renders the contents of this tag as a bytestring."""
+        """Renders the contents of this tag as a bytestring.
+
+        :param indent_level: Each line of the rendering will be
+           indented this many spaces.
+
+        :param encoding: The bytestring will be in this encoding.
+
+        :param formatter: The output formatter responsible for converting
+           entities to Unicode characters.
+        """
+
        contents = self.decode_contents(indent_level, encoding, formatter)
        return contents.encode(encoding)

@@ -1201,63 +1276,89 @@ class Tag(PageElement):

    _selector_combinators = ['>', '+', '~']
    _select_debug = False
-    def select(self, selector, _candidate_generator=None):
+    def select_one(self, selector):
        """Perform a CSS selection operation on the current element."""
-        tokens = selector.split()
+        value = self.select(selector, limit=1)
+        if value:
+            return value[0]
+        return None
+
+    def select(self, selector, _candidate_generator=None, limit=None):
+        """Perform a CSS selection operation on the current element."""
+
+        # Remove whitespace directly after the grouping operator ','
+        # then split into tokens.
+        tokens = re.sub(',[\s]*',',', selector).split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])
+
        if self._select_debug:
            print 'Running CSS selector "%s"' % selector
-        for index, token in enumerate(tokens):
-            if self._select_debug:
-                print ' Considering token "%s"' % token
-            recursive_candidate_generator = None
-            tag_name = None
+
+        for index, token_group in enumerate(tokens):
+            new_context = []
+            new_context_ids = set([])
+
+            # Grouping selectors, ie: p,a
+            grouped_tokens = token_group.split(',')
+            if '' in grouped_tokens:
+                raise ValueError('Invalid group selection syntax: %s' % token_group)
+
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print '  Token was consumed by the previous combinator.'
                continue
-            # Each operation corresponds to a checker function, a rule
-            # for determining whether a candidate matches the
-            # selector. Candidates are generated by the active
-            # iterator.
-            checker = None

-            m = self.attribselect_re.match(token)
-            if m is not None:
-                # Attribute selector
-                tag_name, attribute, operator, value = m.groups()
-                checker = self._attribute_checker(operator, attribute, value)
+            for token in grouped_tokens:
+                if self._select_debug:
+                    print ' Considering token "%s"' % token
+                recursive_candidate_generator = None
+                tag_name = None

-            elif '#' in token:
-                # ID selector
-                tag_name, tag_id = token.split('#', 1)
-                def id_matches(tag):
-                    return tag.get('id', None) == tag_id
-                checker = id_matches
+                # Each operation corresponds to a checker function, a rule
+                # for determining whether a candidate matches the
+                # selector. Candidates are generated by the active
+                # iterator.
+                checker = None

-            elif '.' in token:
-                # Class selector
-                tag_name, klass = token.split('.', 1)
-                classes = set(klass.split('.'))
-                def classes_match(candidate):
-                    return classes.issubset(candidate.get('class', []))
-                checker = classes_match
+                m = self.attribselect_re.match(token)
+                if m is not None:
+                    # Attribute selector
+                    tag_name, attribute, operator, value = m.groups()
+                    checker = self._attribute_checker(operator, attribute, value)

-            elif ':' in token:
-                # Pseudo-class
-                tag_name, pseudo = token.split(':', 1)
-                if tag_name == '':
-                    raise ValueError(
-                        "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
-                found = []
-                if pseudo_attributes is not None:
-                    pseudo_type, pseudo_value = pseudo_attributes.groups()
+                elif '#' in token:
+                    # ID selector
+                    tag_name, tag_id = token.split('#', 1)
+                    def id_matches(tag):
+                        return tag.get('id', None) == tag_id
+                    checker = id_matches
+
+                elif '.' in token:
+                    # Class selector
+                    tag_name, klass = token.split('.', 1)
+                    classes = set(klass.split('.'))
+                    def classes_match(candidate):
+                        return classes.issubset(candidate.get('class', []))
+                    checker = classes_match
+
+                elif ':' in token:
+                    # Pseudo-class
+                    tag_name, pseudo = token.split(':', 1)
+                    if tag_name == '':
+                        raise ValueError(
+                            "A pseudo-class must be prefixed with a tag name.")
+                    pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                    found = []
+                    if pseudo_attributes is None:
+                        pseudo_type = pseudo
+                        pseudo_value = None
+                    else:
+                        pseudo_type, pseudo_value = pseudo_attributes.groups()
                    if pseudo_type == 'nth-of-type':
                        try:
                            pseudo_value = int(pseudo_value)
@@ -1286,109 +1387,110 @@ class Tag(PageElement):
                        raise NotImplementedError(
                            'Only the following pseudo-classes are implemented: nth-of-type.')

-            elif token == '*':
-                # Star selector -- matches everything
-                pass
-            elif token == '>':
-                # Run the next token as a CSS selector against the
-                # direct children of each tag in the current context.
-                recursive_candidate_generator = lambda tag: tag.children
-            elif token == '~':
-                # Run the next token as a CSS selector against the
-                # siblings of each tag in the current context.
-                recursive_candidate_generator = lambda tag: tag.next_siblings
-            elif token == '+':
-                # For each tag in the current context, run the next
-                # token as a CSS selector against the tag's next
-                # sibling that's a tag.
-                def next_tag_sibling(tag):
-                    yield tag.find_next_sibling(True)
-                recursive_candidate_generator = next_tag_sibling
+                elif token == '*':
+                    # Star selector -- matches everything
+                    pass
+                elif token == '>':
+                    # Run the next token as a CSS selector against the
+                    # direct children of each tag in the current context.
+                    recursive_candidate_generator = lambda tag: tag.children
+                elif token == '~':
+                    # Run the next token as a CSS selector against the
+                    # siblings of each tag in the current context.
+                    recursive_candidate_generator = lambda tag: tag.next_siblings
+                elif token == '+':
+                    # For each tag in the current context, run the next
+                    # token as a CSS selector against the tag's next
+                    # sibling that's a tag.
+                    def next_tag_sibling(tag):
+                        yield tag.find_next_sibling(True)
+                    recursive_candidate_generator = next_tag_sibling

-            elif self.tag_name_re.match(token):
-                # Just a tag name.
-                tag_name = token
-            else:
-                raise ValueError(
-                    'Unsupported or invalid CSS selector: "%s"' % token)
-
-            if recursive_candidate_generator:
-                # This happens when the selector looks like  "> foo".
-                #
-                # The generator calls select() recursively on every
-                # member of the current context, passing in a different
-                # candidate generator and a different selector.
-                #
-                # In the case of "> foo", the candidate generator is
-                # one that yields a tag's direct children (">"), and
-                # the selector is "foo".
-                next_token = tokens[index+1]
-                def recursive_select(tag):
-                    if self._select_debug:
-                        print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
-                        print '-' * 40
-                    for i in tag.select(next_token, recursive_candidate_generator):
-                        if self._select_debug:
-                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
-                        yield i
-                    if self._select_debug:
-                        print '-' * 40
-                _use_candidate_generator = recursive_select
-            elif _candidate_generator is None:
-                # By default, a tag's candidates are all of its
-                # children. If tag_name is defined, only yield tags
-                # with that name.
-                if self._select_debug:
-                    if tag_name:
-                        check = "[any]"
-                    else:
-                        check = tag_name
-                    print '   Default candidate generator, tag name="%s"' % check
-                if self._select_debug:
-                    # This is redundant with later code, but it stops
-                    # a bunch of bogus tags from cluttering up the
-                    # debug log.
-                    def default_candidate_generator(tag):
-                        for child in tag.descendants:
-                            if not isinstance(child, Tag):
-                                continue
-                            if tag_name and not child.name == tag_name:
-                                continue
-                            yield child
-                    _use_candidate_generator = default_candidate_generator
+                elif self.tag_name_re.match(token):
+                    # Just a tag name.
+                    tag_name = token
                else:
-                    _use_candidate_generator = lambda tag: tag.descendants
-            else:
-                _use_candidate_generator = _candidate_generator
-
-            new_context = []
-            new_context_ids = set([])
-            for tag in current_context:
-                if self._select_debug:
-                    print "    Running candidate generator on %s %s" % (
-                        tag.name, repr(tag.attrs))
-                for candidate in _use_candidate_generator(tag):
-                    if not isinstance(candidate, Tag):
-                        continue
-                    if tag_name and candidate.name != tag_name:
-                        continue
-                    if checker is not None:
-                        try:
-                            result = checker(candidate)
-                        except StopIteration:
-                            # The checker has decided we should no longer
-                            # run the generator.
-                            break
-                    if checker is None or result:
+                    raise ValueError(
+                        'Unsupported or invalid CSS selector: "%s"' % token)
+                if recursive_candidate_generator:
+                    # This happens when the selector looks like  "> foo".
+                    #
+                    # The generator calls select() recursively on every
+                    # member of the current context, passing in a different
+                    # candidate generator and a different selector.
+                    #
+                    # In the case of "> foo", the candidate generator is
+                    # one that yields a tag's direct children (">"), and
+                    # the selector is "foo".
+                    next_token = tokens[index+1]
+                    def recursive_select(tag):
                        if self._select_debug:
-                            print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
-                        if id(candidate) not in new_context_ids:
-                            # If a tag matches a selector more than once,
-                            # don't include it in the context more than once.
-                            new_context.append(candidate)
-                            new_context_ids.add(id(candidate))
-                    elif self._select_debug:
-                        print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+                            print '    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
+                            print '-' * 40
+                        for i in tag.select(next_token, recursive_candidate_generator):
+                            if self._select_debug:
+                                print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+                            yield i
+                        if self._select_debug:
+                            print '-' * 40
+                    _use_candidate_generator = recursive_select
+                elif _candidate_generator is None:
+                    # By default, a tag's candidates are all of its
+                    # children. If tag_name is defined, only yield tags
+                    # with that name.
+                    if self._select_debug:
+                        if tag_name:
+                            check = "[any]"
+                        else:
+                            check = tag_name
+                        print '   Default candidate generator, tag name="%s"' % check
+                    if self._select_debug:
+                        # This is redundant with later code, but it stops
+                        # a bunch of bogus tags from cluttering up the
+                        # debug log.
+                        def default_candidate_generator(tag):
+                            for child in tag.descendants:
+                                if not isinstance(child, Tag):
+                                    continue
+                                if tag_name and not child.name == tag_name:
+                                    continue
+                                yield child
+                        _use_candidate_generator = default_candidate_generator
+                    else:
+                        _use_candidate_generator = lambda tag: tag.descendants
+                else:
+                    _use_candidate_generator = _candidate_generator
+
+                count = 0
+                for tag in current_context:
+                    if self._select_debug:
+                        print "    Running candidate generator on %s %s" % (
+                            tag.name, repr(tag.attrs))
+                    for candidate in _use_candidate_generator(tag):
+                        if not isinstance(candidate, Tag):
+                            continue
+                        if tag_name and candidate.name != tag_name:
+                            continue
+                        if checker is not None:
+                            try:
+                                result = checker(candidate)
+                            except StopIteration:
+                                # The checker has decided we should no longer
+                                # run the generator.
+                                break
+                        if checker is None or result:
+                            if self._select_debug:
+                                print "     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                            if id(candidate) not in new_context_ids:
+                                # If a tag matches a selector more than once,
+                                # don't include it in the context more than once.
+                                new_context.append(candidate)
+                                new_context_ids.add(id(candidate))
+                                if limit and len(new_context) >= limit:
+                                    break
+                        elif self._select_debug:
+                            print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+

            current_context = new_context

@@ -1,5 +1,6 @@
 """Helper classes for tests."""

+import pickle
 import copy
 import functools
 import unittest
@@ -43,6 +44,16 @@ class SoupTest(unittest.TestCase):

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

+    def assertConnectedness(self, element):
+        """Ensure that next_element and previous_element are properly
+        set for all descendants of the given element.
+
+        Walks ``element.descendants`` in order and asserts, for every
+        consecutive pair, that the earlier node's ``next_element`` is
+        the later node and the later node's ``previous_element`` is
+        the earlier one.
+        """
+        earlier = None
+        for e in element.descendants:
+            # Compare against None explicitly: a string node is a
+            # unicode subclass, so an empty one is falsy and a plain
+            # truthiness test would skip the check that follows it.
+            if earlier is not None:
+                self.assertEqual(e, earlier.next_element)
+                self.assertEqual(earlier, e.previous_element)
+            earlier = e

 class HTMLTreeBuilderSmokeTest(object):

@@ -54,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object):
    markup in these tests, there's not much room for interpretation.
    """

+    def test_pickle_and_unpickle_identity(self):
+        # A round trip through pickle (protocol 2) must give back a
+        # BeautifulSoup object that renders the same as the original.
+        original = self.soup("<a><b>foo</a>")
+        restored = pickle.loads(pickle.dumps(original, 2))
+        self.assertEqual(restored.__class__, BeautifulSoup)
+        self.assertEqual(restored.decode(), original.decode())
+
    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +134,11 @@ class HTMLTreeBuilderSmokeTest(object):
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

+    def test_processing_instruction(self):
+        # A processing instruction is expected to come back
+        # byte-for-byte after being parsed and re-encoded.
+        pi_bytes = b"""<?PITarget PIContent?>"""
+        rendered = self.soup(pi_bytes).encode("utf8")
+        self.assertEqual(pi_bytes, rendered)
+
    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

@@ -155,6 +180,23 @@ class HTMLTreeBuilderSmokeTest(object):
    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

+    def test_double_head(self):
+        # A <script> placed between </head> and <body> must still be
+        # findable, with its attributes intact.
+        document = '''<!DOCTYPE html>
+<html>
+<head>
+<title>Ordinary HEAD element test</title>
+</head>
+<script type="text/javascript">
+alert("Help!");
+</script>
+<body>
+Hello, world!
+</body>
+</html>
+'''
+        script_tag = self.soup(document).find('script')
+        self.assertEqual("text/javascript", script_tag['type'])
+
    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
@@ -221,6 +263,14 @@ class HTMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes of the
+        # <html> tag, which has caused problems with multivalued
+        # attributes. The class value must still be split into a list.
+        tree = self.soup('<html class="a b"></html>')
+        html_classes = tree.html['class']
+        self.assertEqual(["a", "b"], html_classes)
+
    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

@@ -253,6 +303,35 @@ class HTMLTreeBuilderSmokeTest(object):
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
+
+    def test_head_tag_between_head_and_body(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        # A <link> sits between </head> and <body>; the body must
+        # still be reachable and the element links must be consistent.
+        markup = """<html><head></head>
+  <link></link>
+  <body>foo</body>
+</html>
+"""
+        tree = self.soup(markup)
+        body = tree.html.body
+        self.assertNotEqual(None, body)
+        self.assertConnectedness(tree)
+
+    def test_multiple_copies_of_a_tag(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        # The first <a> is left unclosed inside the <div>; the element
+        # links under <article> must still be consistent.
+        markup = """<!DOCTYPE html>
+<html>
+ <body>
+   <article id="a" >
+   <div><a href="1"></div>
+   <footer>
+     <a href="2"></a>
+   </footer>
+  </article>
+  </body>
+</html>
+"""
+        article = self.soup(markup).article
+        self.assertConnectedness(article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
@@ -463,6 +542,15 @@ class HTMLTreeBuilderSmokeTest(object):

 class XMLTreeBuilderSmokeTest(object):

+    def test_pickle_and_unpickle_identity(self):
+        # Serialise the tree with pickle protocol 2 and load it back;
+        # the result must be a BeautifulSoup object with the same
+        # rendering as the tree it came from.
+        source_tree = self.soup("<a><b>foo</a>")
+        payload = pickle.dumps(source_tree, 2)
+        roundtripped = pickle.loads(payload)
+        self.assertEqual(roundtripped.__class__, BeautifulSoup)
+        self.assertEqual(roundtripped.decode(), source_tree.decode())
+
    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
@@ -485,7 +573,7 @@ class XMLTreeBuilderSmokeTest(object):
  <script type="text/javascript">
  </script>
 """
-        soup = BeautifulSoup(doc, "xml")
+        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
@@ -20,4 +20,6 @@ from .serializer import serialize

 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]
-__version__ = "0.999"
+
+# this has to be at the top level, see how setup.py parses this
+__version__ = "0.999999"
@@ -1,292 +1,290 @@
 from __future__ import absolute_import, division, unicode_literals

 import string
-import gettext
-_ = gettext.gettext

 EOF = None

 E = {
    "null-character":
-        _("Null character in input stream, replaced with U+FFFD."),
+        "Null character in input stream, replaced with U+FFFD.",
    "invalid-codepoint":
-        _("Invalid codepoint in stream."),
+        "Invalid codepoint in stream.",
    "incorrectly-placed-solidus":
-        _("Solidus (/) incorrectly placed in tag."),
+        "Solidus (/) incorrectly placed in tag.",
    "incorrect-cr-newline-entity":
-        _("Incorrect CR newline entity, replaced with LF."),
+        "Incorrect CR newline entity, replaced with LF.",
    "illegal-windows-1252-entity":
-        _("Entity used with illegal number (windows-1252 reference)."),
+        "Entity used with illegal number (windows-1252 reference).",
    "cant-convert-numeric-entity":
-        _("Numeric entity couldn't be converted to character "
-          "(codepoint U+%(charAsInt)08x)."),
+        "Numeric entity couldn't be converted to character "
+        "(codepoint U+%(charAsInt)08x).",
    "illegal-codepoint-for-numeric-entity":
-        _("Numeric entity represents an illegal codepoint: "
-          "U+%(charAsInt)08x."),
+        "Numeric entity represents an illegal codepoint: "
+        "U+%(charAsInt)08x.",
    "numeric-entity-without-semicolon":
-        _("Numeric entity didn't end with ';'."),
+        "Numeric entity didn't end with ';'.",
    "expected-numeric-entity-but-got-eof":
-        _("Numeric entity expected. Got end of file instead."),
+        "Numeric entity expected. Got end of file instead.",
    "expected-numeric-entity":
-        _("Numeric entity expected but none found."),
+        "Numeric entity expected but none found.",
    "named-entity-without-semicolon":
-        _("Named entity didn't end with ';'."),
+        "Named entity didn't end with ';'.",
    "expected-named-entity":
-        _("Named entity expected. Got none."),
+        "Named entity expected. Got none.",
    "attributes-in-end-tag":
-        _("End tag contains unexpected attributes."),
+        "End tag contains unexpected attributes.",
    'self-closing-flag-on-end-tag':
-        _("End tag contains unexpected self-closing flag."),
+        "End tag contains unexpected self-closing flag.",
    "expected-tag-name-but-got-right-bracket":
-        _("Expected tag name. Got '>' instead."),
+        "Expected tag name. Got '>' instead.",
    "expected-tag-name-but-got-question-mark":
-        _("Expected tag name. Got '?' instead. (HTML doesn't "
-          "support processing instructions.)"),
+        "Expected tag name. Got '?' instead. (HTML doesn't "
+        "support processing instructions.)",
    "expected-tag-name":
-        _("Expected tag name. Got something else instead"),
+        "Expected tag name. Got something else instead",
    "expected-closing-tag-but-got-right-bracket":
-        _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
+        "Expected closing tag. Got '>' instead. Ignoring '</>'.",
    "expected-closing-tag-but-got-eof":
-        _("Expected closing tag. Unexpected end of file."),
+        "Expected closing tag. Unexpected end of file.",
    "expected-closing-tag-but-got-char":
-        _("Expected closing tag. Unexpected character '%(data)s' found."),
+        "Expected closing tag. Unexpected character '%(data)s' found.",
    "eof-in-tag-name":
-        _("Unexpected end of file in the tag name."),
+        "Unexpected end of file in the tag name.",
    "expected-attribute-name-but-got-eof":
-        _("Unexpected end of file. Expected attribute name instead."),
+        "Unexpected end of file. Expected attribute name instead.",
    "eof-in-attribute-name":
-        _("Unexpected end of file in attribute name."),
+        "Unexpected end of file in attribute name.",
    "invalid-character-in-attribute-name":
-        _("Invalid character in attribute name"),
+        "Invalid character in attribute name",
    "duplicate-attribute":
-        _("Dropped duplicate attribute on tag."),
+        "Dropped duplicate attribute on tag.",
    "expected-end-of-tag-name-but-got-eof":
-        _("Unexpected end of file. Expected = or end of tag."),
+        "Unexpected end of file. Expected = or end of tag.",
    "expected-attribute-value-but-got-eof":
-        _("Unexpected end of file. Expected attribute value."),
+        "Unexpected end of file. Expected attribute value.",
    "expected-attribute-value-but-got-right-bracket":
-        _("Expected attribute value. Got '>' instead."),
+        "Expected attribute value. Got '>' instead.",
    'equals-in-unquoted-attribute-value':
-        _("Unexpected = in unquoted attribute"),
+        "Unexpected = in unquoted attribute",
    'unexpected-character-in-unquoted-attribute-value':
-        _("Unexpected character in unquoted attribute"),
+        "Unexpected character in unquoted attribute",
    "invalid-character-after-attribute-name":
-        _("Unexpected character after attribute name."),
+        "Unexpected character after attribute name.",
    "unexpected-character-after-attribute-value":
-        _("Unexpected character after attribute value."),
+        "Unexpected character after attribute value.",
    "eof-in-attribute-value-double-quote":
-        _("Unexpected end of file in attribute value (\")."),
+        "Unexpected end of file in attribute value (\").",
    "eof-in-attribute-value-single-quote":
-        _("Unexpected end of file in attribute value (')."),
+        "Unexpected end of file in attribute value (').",
    "eof-in-attribute-value-no-quotes":
-        _("Unexpected end of file in attribute value."),
+        "Unexpected end of file in attribute value.",
    "unexpected-EOF-after-solidus-in-tag":
-        _("Unexpected end of file in tag. Expected >"),
+        "Unexpected end of file in tag. Expected >",
    "unexpected-character-after-solidus-in-tag":
-        _("Unexpected character after / in tag. Expected >"),
+        "Unexpected character after / in tag. Expected >",
    "expected-dashes-or-doctype":
-        _("Expected '--' or 'DOCTYPE'. Not found."),
+        "Expected '--' or 'DOCTYPE'. Not found.",
    "unexpected-bang-after-double-dash-in-comment":
-        _("Unexpected ! after -- in comment"),
+        "Unexpected ! after -- in comment",
    "unexpected-space-after-double-dash-in-comment":
-        _("Unexpected space after -- in comment"),
+        "Unexpected space after -- in comment",
    "incorrect-comment":
-        _("Incorrect comment."),
+        "Incorrect comment.",
    "eof-in-comment":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-dash":
-        _("Unexpected end of file in comment (-)"),
+        "Unexpected end of file in comment (-)",
    "unexpected-dash-after-double-dash-in-comment":
-        _("Unexpected '-' after '--' found in comment."),
+        "Unexpected '-' after '--' found in comment.",
    "eof-in-comment-double-dash":
-        _("Unexpected end of file in comment (--)."),
+        "Unexpected end of file in comment (--).",
    "eof-in-comment-end-space-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-bang-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "unexpected-char-in-comment":
-        _("Unexpected character in comment found."),
+        "Unexpected character in comment found.",
    "need-space-after-doctype":
-        _("No space after literal string 'DOCTYPE'."),
+        "No space after literal string 'DOCTYPE'.",
    "expected-doctype-name-but-got-right-bracket":
-        _("Unexpected > character. Expected DOCTYPE name."),
+        "Unexpected > character. Expected DOCTYPE name.",
    "expected-doctype-name-but-got-eof":
-        _("Unexpected end of file. Expected DOCTYPE name."),
+        "Unexpected end of file. Expected DOCTYPE name.",
    "eof-in-doctype-name":
-        _("Unexpected end of file in DOCTYPE name."),
+        "Unexpected end of file in DOCTYPE name.",
    "eof-in-doctype":
-        _("Unexpected end of file in DOCTYPE."),
+        "Unexpected end of file in DOCTYPE.",
    "expected-space-or-right-bracket-in-doctype":
-        _("Expected space or '>'. Got '%(data)s'"),
+        "Expected space or '>'. Got '%(data)s'",
    "unexpected-end-of-doctype":
-        _("Unexpected end of DOCTYPE."),
+        "Unexpected end of DOCTYPE.",
    "unexpected-char-in-doctype":
-        _("Unexpected character in DOCTYPE."),
+        "Unexpected character in DOCTYPE.",
    "eof-in-innerhtml":
-        _("XXX innerHTML EOF"),
+        "XXX innerHTML EOF",
    "unexpected-doctype":
-        _("Unexpected DOCTYPE. Ignored."),
+        "Unexpected DOCTYPE. Ignored.",
    "non-html-root":
-        _("html needs to be the first start tag."),
+        "html needs to be the first start tag.",
    "expected-doctype-but-got-eof":
-        _("Unexpected End of file. Expected DOCTYPE."),
+        "Unexpected End of file. Expected DOCTYPE.",
    "unknown-doctype":
-        _("Erroneous DOCTYPE."),
+        "Erroneous DOCTYPE.",
    "expected-doctype-but-got-chars":
-        _("Unexpected non-space characters. Expected DOCTYPE."),
+        "Unexpected non-space characters. Expected DOCTYPE.",
    "expected-doctype-but-got-start-tag":
-        _("Unexpected start tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected start tag (%(name)s). Expected DOCTYPE.",
    "expected-doctype-but-got-end-tag":
-        _("Unexpected end tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected end tag (%(name)s). Expected DOCTYPE.",
    "end-tag-after-implied-root":
-        _("Unexpected end tag (%(name)s) after the (implied) root element."),
+        "Unexpected end tag (%(name)s) after the (implied) root element.",
    "expected-named-closing-tag-but-got-eof":
-        _("Unexpected end of file. Expected end tag (%(name)s)."),
+        "Unexpected end of file. Expected end tag (%(name)s).",
    "two-heads-are-not-better-than-one":
-        _("Unexpected start tag head in existing head. Ignored."),
+        "Unexpected start tag head in existing head. Ignored.",
    "unexpected-end-tag":
-        _("Unexpected end tag (%(name)s). Ignored."),
+        "Unexpected end tag (%(name)s). Ignored.",
    "unexpected-start-tag-out-of-my-head":
-        _("Unexpected start tag (%(name)s) that can be in head. Moved."),
+        "Unexpected start tag (%(name)s) that can be in head. Moved.",
    "unexpected-start-tag":
-        _("Unexpected start tag (%(name)s)."),
+        "Unexpected start tag (%(name)s).",
    "missing-end-tag":
-        _("Missing end tag (%(name)s)."),
+        "Missing end tag (%(name)s).",
    "missing-end-tags":
-        _("Missing end tags (%(name)s)."),
+        "Missing end tags (%(name)s).",
    "unexpected-start-tag-implies-end-tag":
-        _("Unexpected start tag (%(startName)s) "
-          "implies end tag (%(endName)s)."),
+        "Unexpected start tag (%(startName)s) "
+        "implies end tag (%(endName)s).",
    "unexpected-start-tag-treated-as":
-        _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
    "deprecated-tag":
-        _("Unexpected start tag %(name)s. Don't use it!"),
+        "Unexpected start tag %(name)s. Don't use it!",
    "unexpected-start-tag-ignored":
-        _("Unexpected start tag %(name)s. Ignored."),
+        "Unexpected start tag %(name)s. Ignored.",
    "expected-one-end-tag-but-got-another":
-        _("Unexpected end tag (%(gotName)s). "
-          "Missing end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). "
+        "Missing end tag (%(expectedName)s).",
    "end-tag-too-early":
-        _("End tag (%(name)s) seen too early. Expected other end tag."),
+        "End tag (%(name)s) seen too early. Expected other end tag.",
    "end-tag-too-early-named":
-        _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
    "end-tag-too-early-ignored":
-        _("End tag (%(name)s) seen too early. Ignored."),
+        "End tag (%(name)s) seen too early. Ignored.",
    "adoption-agency-1.1":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 1 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 1 of the adoption agency algorithm.",
    "adoption-agency-1.2":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 2 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 2 of the adoption agency algorithm.",
    "adoption-agency-1.3":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 3 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 3 of the adoption agency algorithm.",
    "adoption-agency-4.4":
-        _("End tag (%(name)s) violates step 4, "
-          "paragraph 4 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 4, "
+        "paragraph 4 of the adoption agency algorithm.",
    "unexpected-end-tag-treated-as":
-        _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
    "no-end-tag":
-        _("This element (%(name)s) has no end tag."),
+        "This element (%(name)s) has no end tag.",
    "unexpected-implied-end-tag-in-table":
-        _("Unexpected implied end tag (%(name)s) in the table phase."),
+        "Unexpected implied end tag (%(name)s) in the table phase.",
    "unexpected-implied-end-tag-in-table-body":
-        _("Unexpected implied end tag (%(name)s) in the table body phase."),
+        "Unexpected implied end tag (%(name)s) in the table body phase.",
    "unexpected-char-implies-table-voodoo":
-        _("Unexpected non-space characters in "
-          "table context caused voodoo mode."),
+        "Unexpected non-space characters in "
+        "table context caused voodoo mode.",
    "unexpected-hidden-input-in-table":
-        _("Unexpected input with type hidden in table context."),
+        "Unexpected input with type hidden in table context.",
    "unexpected-form-in-table":
-        _("Unexpected form in table context."),
+        "Unexpected form in table context.",
    "unexpected-start-tag-implies-table-voodoo":
-        _("Unexpected start tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected start tag (%(name)s) in "
+        "table context caused voodoo mode.",
    "unexpected-end-tag-implies-table-voodoo":
-        _("Unexpected end tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected end tag (%(name)s) in "
+        "table context caused voodoo mode.",
    "unexpected-cell-in-table-body":
-        _("Unexpected table cell start tag (%(name)s) "
-          "in the table body phase."),
+        "Unexpected table cell start tag (%(name)s) "
+        "in the table body phase.",
    "unexpected-cell-end-tag":
-        _("Got table cell end tag (%(name)s) "
-          "while required end tags are missing."),
+        "Got table cell end tag (%(name)s) "
+        "while required end tags are missing.",
    "unexpected-end-tag-in-table-body":
-        _("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
    "unexpected-implied-end-tag-in-table-row":
-        _("Unexpected implied end tag (%(name)s) in the table row phase."),
+        "Unexpected implied end tag (%(name)s) in the table row phase.",
    "unexpected-end-tag-in-table-row":
-        _("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
    "unexpected-select-in-select":
-        _("Unexpected select start tag in the select phase "
-          "treated as select end tag."),
+        "Unexpected select start tag in the select phase "
+        "treated as select end tag.",
    "unexpected-input-in-select":
-        _("Unexpected input start tag in the select phase."),
+        "Unexpected input start tag in the select phase.",
    "unexpected-start-tag-in-select":
-        _("Unexpected start tag token (%(name)s in the select phase. "
-          "Ignored."),
+        "Unexpected start tag token (%(name)s in the select phase. "
+        "Ignored.",
    "unexpected-end-tag-in-select":
-        _("Unexpected end tag (%(name)s) in the select phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the select phase. Ignored.",
    "unexpected-table-element-start-tag-in-select-in-table":
-        _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+        "Unexpected table element start tag (%(name)s) in the select in table phase.",
    "unexpected-table-element-end-tag-in-select-in-table":
-        _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+        "Unexpected table element end tag (%(name)s) in the select in table phase.",
    "unexpected-char-after-body":
-        _("Unexpected non-space characters in the after body phase."),
+        "Unexpected non-space characters in the after body phase.",
    "unexpected-start-tag-after-body":
-        _("Unexpected start tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected start tag token (%(name)s)"
+        " in the after body phase.",
    "unexpected-end-tag-after-body":
-        _("Unexpected end tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected end tag token (%(name)s)"
+        " in the after body phase.",
    "unexpected-char-in-frameset":
-        _("Unexpected characters in the frameset phase. Characters ignored."),
+        "Unexpected characters in the frameset phase. Characters ignored.",
    "unexpected-start-tag-in-frameset":
-        _("Unexpected start tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected start tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
    "unexpected-frameset-in-frameset-innerhtml":
-        _("Unexpected end tag token (frameset) "
-          "in the frameset phase (innerHTML)."),
+        "Unexpected end tag token (frameset) "
+        "in the frameset phase (innerHTML).",
    "unexpected-end-tag-in-frameset":
-        _("Unexpected end tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected end tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
    "unexpected-char-after-frameset":
-        _("Unexpected non-space characters in the "
-          "after frameset phase. Ignored."),
+        "Unexpected non-space characters in the "
+        "after frameset phase. Ignored.",
    "unexpected-start-tag-after-frameset":
-        _("Unexpected start tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected start tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
    "unexpected-end-tag-after-frameset":
-        _("Unexpected end tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected end tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
    "unexpected-end-tag-after-body-innerhtml":
-        _("Unexpected end tag after body(innerHtml)"),
+        "Unexpected end tag after body(innerHtml)",
    "expected-eof-but-got-char":
-        _("Unexpected non-space characters. Expected end of file."),
+        "Unexpected non-space characters. Expected end of file.",
    "expected-eof-but-got-start-tag":
-        _("Unexpected start tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected start tag (%(name)s)"
+        ". Expected end of file.",
    "expected-eof-but-got-end-tag":
-        _("Unexpected end tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected end tag (%(name)s)"
+        ". Expected end of file.",
    "eof-in-table":
-        _("Unexpected end of file. Expected table content."),
+        "Unexpected end of file. Expected table content.",
    "eof-in-select":
-        _("Unexpected end of file. Expected select content."),
+        "Unexpected end of file. Expected select content.",
    "eof-in-frameset":
-        _("Unexpected end of file. Expected frameset content."),
+        "Unexpected end of file. Expected frameset content.",
    "eof-in-script-in-script":
-        _("Unexpected end of file. Expected script content."),
+        "Unexpected end of file. Expected script content.",
    "eof-in-foreign-lands":
-        _("Unexpected end of file. Expected foreign content"),
+        "Unexpected end of file. Expected foreign content",
    "non-void-element-with-trailing-solidus":
-        _("Trailing solidus not allowed on element %(name)s"),
+        "Trailing solidus not allowed on element %(name)s",
    "unexpected-html-element-in-foreign-content":
-        _("Element %(name)s not allowed in a non-html context"),
+        "Element %(name)s not allowed in a non-html context",
    "unexpected-end-tag-before-html":
-        _("Unexpected end tag (%(name)s) before html."),
+        "Unexpected end tag (%(name)s) before html.",
    "XXX-undefined-error":
-        _("Undefined error (this sucks and should be fixed)"),
+        "Undefined error (this sucks and should be fixed)",
 }

 namespaces = {
@@ -298,7 +296,7 @@ namespaces = {
    "xmlns": "http://www.w3.org/2000/xmlns/"
 }

-scopingElements = frozenset((
+scopingElements = frozenset([
    (namespaces["html"], "applet"),
    (namespaces["html"], "caption"),
    (namespaces["html"], "html"),
@@ -316,9 +314,9 @@ scopingElements = frozenset((
    (namespaces["svg"], "foreignObject"),
    (namespaces["svg"], "desc"),
    (namespaces["svg"], "title"),
-))
+])

-formattingElements = frozenset((
+formattingElements = frozenset([
    (namespaces["html"], "a"),
    (namespaces["html"], "b"),
    (namespaces["html"], "big"),
@@ -333,9 +331,9 @@ formattingElements = frozenset((
    (namespaces["html"], "strong"),
    (namespaces["html"], "tt"),
    (namespaces["html"], "u")
-))
+])

-specialElements = frozenset((
+specialElements = frozenset([
    (namespaces["html"], "address"),
    (namespaces["html"], "applet"),
    (namespaces["html"], "area"),
@@ -416,22 +414,22 @@ specialElements = frozenset((
    (namespaces["html"], "wbr"),
    (namespaces["html"], "xmp"),
    (namespaces["svg"], "foreignObject")
-))
+])

-htmlIntegrationPointElements = frozenset((
+htmlIntegrationPointElements = frozenset([
    (namespaces["mathml"], "annotaion-xml"),
    (namespaces["svg"], "foreignObject"),
    (namespaces["svg"], "desc"),
    (namespaces["svg"], "title")
-))
+])

-mathmlTextIntegrationPointElements = frozenset((
+mathmlTextIntegrationPointElements = frozenset([
    (namespaces["mathml"], "mi"),
    (namespaces["mathml"], "mo"),
    (namespaces["mathml"], "mn"),
    (namespaces["mathml"], "ms"),
    (namespaces["mathml"], "mtext")
-))
+])

 adjustForeignAttributes = {
    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
@@ -451,21 +449,21 @@ adjustForeignAttributes = {
 unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
                                  adjustForeignAttributes.items()])

-spaceCharacters = frozenset((
+spaceCharacters = frozenset([
    "\t",
    "\n",
    "\u000C",
    " ",
    "\r"
-))
+])

-tableInsertModeElements = frozenset((
+tableInsertModeElements = frozenset([
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr"
-))
+])

 asciiLowercase = frozenset(string.ascii_lowercase)
 asciiUppercase = frozenset(string.ascii_uppercase)
@@ -486,7 +484,7 @@ headingElements = (
    "h6"
 )

-voidElements = frozenset((
+voidElements = frozenset([
    "base",
    "command",
    "event-source",
@@ -502,11 +500,11 @@ voidElements = frozenset((
    "input",
    "source",
    "track"
-))
+])

-cdataElements = frozenset(('title', 'textarea'))
+cdataElements = frozenset(['title', 'textarea'])

-rcdataElements = frozenset((
+rcdataElements = frozenset([
    'style',
    'script',
    'xmp',
@@ -514,27 +512,27 @@ rcdataElements = frozenset((
    'noembed',
    'noframes',
    'noscript'
-))
+])

 booleanAttributes = {
-    "": frozenset(("irrelevant",)),
-    "style": frozenset(("scoped",)),
-    "img": frozenset(("ismap",)),
-    "audio": frozenset(("autoplay", "controls")),
-    "video": frozenset(("autoplay", "controls")),
-    "script": frozenset(("defer", "async")),
-    "details": frozenset(("open",)),
-    "datagrid": frozenset(("multiple", "disabled")),
-    "command": frozenset(("hidden", "disabled", "checked", "default")),
-    "hr": frozenset(("noshade")),
-    "menu": frozenset(("autosubmit",)),
-    "fieldset": frozenset(("disabled", "readonly")),
-    "option": frozenset(("disabled", "readonly", "selected")),
-    "optgroup": frozenset(("disabled", "readonly")),
-    "button": frozenset(("disabled", "autofocus")),
-    "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
-    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
-    "output": frozenset(("disabled", "readonly")),
+    "": frozenset(["irrelevant"]),
+    "style": frozenset(["scoped"]),
+    "img": frozenset(["ismap"]),
+    "audio": frozenset(["autoplay", "controls"]),
+    "video": frozenset(["autoplay", "controls"]),
+    "script": frozenset(["defer", "async"]),
+    "details": frozenset(["open"]),
+    "datagrid": frozenset(["multiple", "disabled"]),
+    "command": frozenset(["hidden", "disabled", "checked", "default"]),
+    "hr": frozenset(["noshade"]),
+    "menu": frozenset(["autosubmit"]),
+    "fieldset": frozenset(["disabled", "readonly"]),
+    "option": frozenset(["disabled", "readonly", "selected"]),
+    "optgroup": frozenset(["disabled", "readonly"]),
+    "button": frozenset(["disabled", "autofocus"]),
+    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
+    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
+    "output": frozenset(["disabled", "readonly"]),
 }

 # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@@ -574,7 +572,7 @@ entitiesWindows1252 = (
    376     # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
 )

-xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])

 entities = {
    "AElig": "\xc6",
@@ -3088,8 +3086,8 @@ tokenTypes = {
    "ParseError": 7
 }

-tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
-                           tokenTypes["EmptyTag"]))
+tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
+                           tokenTypes["EmptyTag"]])


 prefixes = dict([(v, k) for k, v in namespaces.items()])
@@ -1,8 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals

-from gettext import gettext
-_ = gettext
-
 from . import _base
 from ..constants import cdataElements, rcdataElements, voidElements

@@ -23,24 +20,24 @@ class Filter(_base.Filter):
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
+                    raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                if not name:
-                    raise LintError(_("Empty tag name"))
+                    raise LintError("Empty tag name")
                if type == "StartTag" and name in voidElements:
-                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
+                    raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
+                    raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
                if type == "StartTag":
                    open_elements.append(name)
                for name, value in token["data"]:
                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
+                        raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
                    if not name:
-                        raise LintError(_("Empty attribute name"))
+                        raise LintError("Empty attribute name")
                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
+                        raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
@@ -51,43 +48,43 @@ class Filter(_base.Filter):
            elif type == "EndTag":
                name = token["name"]
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                if not name:
-                    raise LintError(_("Empty tag name"))
+                    raise LintError("Empty tag name")
                if name in voidElements:
-                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
+                    raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
                start_name = open_elements.pop()
                if start_name != name:
-                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
+                    raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
                contentModelFlag = "PCDATA"

            elif type == "Comment":
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Comment not in PCDATA content model flag"))
+                    raise LintError("Comment not in PCDATA content model flag")

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
+                    raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
                if not data:
-                    raise LintError(_("%(type)s token with empty data") % {"type": type})
+                    raise LintError("%(type)s token with empty data" % {"type": type})
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
-                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
+                        raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})

            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
+                    raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                # XXX: what to do with token["data"] ?

            elif type in ("ParseError", "SerializeError"):
                pass

            else:
-                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
+                raise LintError("Unknown token type: %(type)s" % {"type": type})

            yield token
@@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements
 from .constants import tokenTypes, ReparseException, namespaces
 from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
 from .constants import adjustForeignAttributes as adjustForeignAttributesMap
+from .constants import E


 def parse(doc, treebuilder="etree", encoding=None,
@@ -129,6 +130,17 @@ class HTMLParser(object):

        self.framesetOK = True

+    @property
+    def documentEncoding(self):
+        """Name of the character encoding used to decode the input stream.
+
+        Evaluates to :obj:`None` while the parser has no ``tokenizer``
+        attribute, i.e. the encoding is not determined yet.
+
+        """
+        if hasattr(self, 'tokenizer'):
+            # charEncoding is indexed at 0 to obtain the encoding name
+            return self.tokenizer.stream.charEncoding[0]
+        return None
+
    def isHTMLIntegrationPoint(self, element):
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
@@ -245,7 +257,7 @@ class HTMLParser(object):
        # XXX The idea is to make errorcode mandatory.
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
-            raise ParseError
+            raise ParseError(E[errorcode] % datavars)

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """
@@ -868,7 +880,7 @@ def getPhases(debug):
            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                (("base", "basefont", "bgsound", "command", "link", "meta",
-                  "noframes", "script", "style", "title"),
+                  "script", "style", "title"),
                 self.startTagProcessInHead),
                ("body", self.startTagBody),
                ("frameset", self.startTagFrameset),
@@ -1205,8 +1217,7 @@ def getPhases(debug):
            attributes["name"] = "isindex"
            self.processStartTag(impliedTagToken("input", "StartTag",
                                                 attributes=attributes,
-                                                 selfClosing=
-                                                 token["selfClosing"]))
+                                                 selfClosing=token["selfClosing"]))
            self.processEndTag(impliedTagToken("label"))
            self.processStartTag(impliedTagToken("hr", "StartTag"))
            self.processEndTag(impliedTagToken("form"))
@@ -28,7 +28,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+
+if utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create the surrogate range
+    # with eval. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    # invalid_unicode_no_surrogate is a complete character class ending
+    # in "]"; the surrogate range must be inserted *inside* that class,
+    # so drop the closing bracket, append the range and close it again.
+    # Plain concatenation would leave the range outside the class and
+    # the pattern would never match a lone surrogate.
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +
+                                    "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ class HTMLUnicodeInputStream(object):

        """

-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if not utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+            self.replaceCharactersRegexp = None
+        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile(
+                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]
@@ -265,11 +281,12 @@ class HTMLUnicodeInputStream(object):
                self._bufferedCharacter = data[-1]
                data = data[:-1]

-        self.reportCharacterErrors(data)
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)

-        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
+            # Replace invalid characters
+            # Note U+0000 is dealt with in the tokenizer
+            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")
@@ -2,11 +2,26 @@ from __future__ import absolute_import, division, unicode_literals

 import re
 from xml.sax.saxutils import escape, unescape
+from six.moves import urllib_parse as urlparse

 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes


+# Verbose pattern for the portion of a ``data:`` URI that follows the scheme:
+# a media type (captured in the ``content_type`` group), optional
+# ``;charset=...`` and ``;base64`` parameters in either order, then a comma
+# followed by the payload.
+content_type_rgx = re.compile(r'''
+                               ^
+                               # Match a content type <application>/<type>
+                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                               # Match any character set and encoding
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
+                               $
+                               ''',
+                              re.VERBOSE)
+
+
 class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

@@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object):
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
                      'y1', 'y2', 'zoomAndPan']

-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
-                       'xlink:href', 'xml:base']
+    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
+                       'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']

    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ class HTMLSanitizerMixin(object):
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse.urlparse(val_unescaped)
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = content_type_rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
@@ -245,7 +270,7 @@ class HTMLSanitizerMixin(object):
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
-                    if not keyword in self.acceptable_css_keywords and \
+                    if keyword not in self.acceptable_css_keywords and \
                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                        break
                else:
@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type

-import gettext
-_ = gettext.gettext
-
 try:
    from functools import reduce
 except ImportError:
@@ -35,7 +32,7 @@ else:
                v = utils.surrogatePairToCodepoint(v)
            else:
                v = ord(v)
-            if not v in encode_entity_map or k.islower():
+            if v not in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k

@@ -208,7 +205,7 @@ class HTMLSerializer(object):
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
-                            self.serializeError(_("System identifer contains both single and double quote characters"))
+                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
@@ -220,7 +217,7 @@ class HTMLSerializer(object):
            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
-                        self.serializeError(_("Unexpected </ in CDATA"))
+                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))
@@ -231,7 +228,7 @@ class HTMLSerializer(object):
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
+                    self.serializeError("Unexpected child element of a CDATA element")
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
@@ -279,20 +276,20 @@ class HTMLSerializer(object):
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
+                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
-                    self.serializeError(_("Comment contains --"))
+                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
-                if not key in entities:
-                    self.serializeError(_("Entity %s not recognized" % name))
+                if key not in entities:
+                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
@@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation):
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
-                    if not Node.TEXT_NODE in self.dom._child_node_types:
+                    if Node.TEXT_NODE not in self.dom._child_node_types:
                        self.dom._child_node_types = list(self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))
@@ -10,8 +10,12 @@ returning an iterator generating tokens.

 from __future__ import absolute_import, division, unicode_literals

+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
+           "pulldom"]
+
 import sys

+from .. import constants
 from ..utils import default_etree

 treeWalkerCache = {}
@@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    """Merge runs of adjacent character tokens into single tokens.
+
+    Consecutive "Characters" and "SpaceCharacters" tokens are joined into
+    one "Characters" token; every other token is yielded unchanged.
+    """
+    buffered = []
+    for token in tokens:
+        if token["type"] in ("Characters", "SpaceCharacters"):
+            buffered.append(token["data"])
+            continue
+        # A non-character token ends the current run: flush it first.
+        if buffered:
+            yield {"type": "Characters", "data": "".join(buffered)}
+            buffered = []
+        yield token
+    # Flush a run that reaches the end of the stream.
+    if buffered:
+        yield {"type": "Characters", "data": "".join(buffered)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers
+
+    Renders the tokens produced by ``walker`` as an indented outline, one
+    entry per line, and returns the lines joined with newlines.
+    """
+    lines = []
+    depth = 0
+    for token in concatenateCharacterTokens(walker):
+        kind = token["type"]
+        pad = " " * depth
+
+        if kind in ("StartTag", "EmptyTag"):
+            # tag name, qualified when the element is not in the HTML namespace
+            tag_ns = token["namespace"]
+            if tag_ns and tag_ns != constants.namespaces["html"]:
+                qualifier = constants.prefixes[tag_ns] if tag_ns in constants.prefixes else tag_ns
+                label = "%s %s" % (qualifier, token["name"])
+            else:
+                label = token["name"]
+            lines.append("%s<%s>" % (pad, label))
+            depth += 2
+
+            # attributes sit one level below the tag (sorted for consistent ordering)
+            attr_pad = " " * depth
+            for (attr_ns, localname), value in sorted(token["data"].items()):
+                if attr_ns:
+                    qualifier = constants.prefixes[attr_ns] if attr_ns in constants.prefixes else attr_ns
+                    label = "%s %s" % (qualifier, localname)
+                else:
+                    label = localname
+                lines.append("%s%s=\"%s\"" % (attr_pad, label, value))
+
+            # self-closing: nothing follows at the deeper level
+            if kind == "EmptyTag":
+                depth -= 2
+
+        elif kind == "EndTag":
+            depth -= 2
+
+        elif kind == "Comment":
+            lines.append("%s<!-- %s -->" % (pad, token["data"]))
+
+        elif kind == "Doctype":
+            doctype_name = token["name"]
+            if not doctype_name:
+                lines.append("%s<!DOCTYPE >" % (pad,))
+            elif token["publicId"]:
+                lines.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                             (pad,
+                              doctype_name,
+                              token["publicId"],
+                              token["systemId"] or ""))
+            elif token["systemId"]:
+                lines.append("""%s<!DOCTYPE %s "" "%s">""" %
+                             (pad,
+                              doctype_name,
+                              token["systemId"]))
+            else:
+                lines.append("%s<!DOCTYPE %s>" % (pad, doctype_name))
+
+        elif kind == "Characters":
+            lines.append("%s\"%s\"" % (pad, token["data"]))
+
+        elif kind == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % kind)
+
+    return "\n".join(lines)
@@ -1,8 +1,8 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type, string_types

-import gettext
-_ = gettext.gettext
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+           "TreeWalker", "NonRecursiveTreeWalker"]

 from xml.dom import Node

@@ -58,7 +58,7 @@ class TreeWalker(object):
               "namespace": to_text(namespace),
               "data": attrs}
        if hasChildren:
-            yield self.error(_("Void element has children"))
+            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
@@ -122,7 +122,7 @@ class TreeWalker(object):
        return {"type": "Entity", "name": text_type(name)}

    def unknown(self, nodeType):
-        return self.error(_("Unknown node type: ") + nodeType)
+        return self.error("Unknown node type: " + nodeType)


 class NonRecursiveTreeWalker(TreeWalker):
@@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals

 from xml.dom import Node

-import gettext
-_ = gettext.gettext
-
 from . import _base


@@ -7,12 +7,10 @@ except ImportError:
        from ordereddict import OrderedDict
    except ImportError:
        OrderedDict = dict
-import gettext
-_ = gettext.gettext

 import re

-from six import text_type
+from six import string_types

 from . import _base
 from ..utils import moduleFactoryFactory
@@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation):
                return _base.COMMENT, node.text

            else:
-                assert type(node.tag) == text_type, type(node.tag)
+                assert isinstance(node.tag, string_types), type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
@@ -4,9 +4,6 @@ from six import text_type
 from lxml import etree
 from ..treebuilders.etree import tag_regexp

-from gettext import gettext
-_ = gettext
-
 from . import _base

 from .. import ihatexml
@@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            return _base.TEXT, ensure_str(getattr(node, key))

        elif isinstance(node, Root):
@@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
-        assert not isinstance(node, tuple), _("Text nodes have no children")
+        assert not isinstance(node, tuple), "Text nodes have no children"

        assert len(node) or node.text, "Node has no children"
        if node.text:
@@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNextSibling(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
@@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                return node
            # else: fallback to "normal" processing
@@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals

 from types import ModuleType

+from six import text_type
+
 try:
    import xml.etree.cElementTree as default_etree
 except ImportError:
@@ -9,7 +11,26 @@ except ImportError:


 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+# Import-time probe. The backslash is doubled so the \uD800 escape is only
+# interpreted when eval() runs, rather than being part of this module's own
+# source literals.
+try:
+    _x = eval('"\\uD800"')
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')
+        assert isinstance(_x, text_type)
+except:
+    # NOTE(review): the bare except looks deliberate -- any failure of the
+    # probe is treated as "lone surrogates unsupported". Confirm how the
+    # targeted runtimes fail before narrowing it.
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True


 class MethodDispatcher(dict):