From 961773238aec4f3aed9b6302f67afca7b6a699df Mon Sep 17 00:00:00 2001 From: Ade Date: Thu, 23 Aug 2012 12:48:17 +1200 Subject: [PATCH 1/6] rutracker search provider Added rutracker as a torrent search provider. BeautifulSoup 4 needs to be installed to parse the results. --- data/interfaces/default/config.html | 14 + data/interfaces/default/history.html | 2 + headphones/__init__.py | 15 +- headphones/searcher.py | 85 +- headphones/searcher_rutracker.py | 253 ++++ headphones/webserve.py | 8 +- lib/bs4/__init__.py | 355 +++++ lib/bs4/builder/__init__.py | 307 +++++ lib/bs4/builder/_html5lib.py | 222 ++++ lib/bs4/builder/_htmlparser.py | 244 ++++ lib/bs4/builder/_lxml.py | 179 +++ lib/bs4/dammit.py | 792 +++++++++++ lib/bs4/element.py | 1347 +++++++++++++++++++ lib/bs4/testing.py | 515 +++++++ lib/bs4/tests/__init__.py | 1 + lib/bs4/tests/test_builder_registry.py | 141 ++ lib/bs4/tests/test_docs.py | 36 + lib/bs4/tests/test_html5lib.py | 58 + lib/bs4/tests/test_htmlparser.py | 19 + lib/bs4/tests/test_lxml.py | 75 ++ lib/bs4/tests/test_soup.py | 368 +++++ lib/bs4/tests/test_tree.py | 1695 ++++++++++++++++++++++++ 22 files changed, 6716 insertions(+), 15 deletions(-) create mode 100644 headphones/searcher_rutracker.py create mode 100644 lib/bs4/__init__.py create mode 100644 lib/bs4/builder/__init__.py create mode 100644 lib/bs4/builder/_html5lib.py create mode 100644 lib/bs4/builder/_htmlparser.py create mode 100644 lib/bs4/builder/_lxml.py create mode 100644 lib/bs4/dammit.py create mode 100644 lib/bs4/element.py create mode 100644 lib/bs4/testing.py create mode 100644 lib/bs4/tests/__init__.py create mode 100644 lib/bs4/tests/test_builder_registry.py create mode 100644 lib/bs4/tests/test_docs.py create mode 100644 lib/bs4/tests/test_html5lib.py create mode 100644 lib/bs4/tests/test_htmlparser.py create mode 100644 lib/bs4/tests/test_lxml.py create mode 100644 lib/bs4/tests/test_soup.py create mode 100644 lib/bs4/tests/test_tree.py diff --git a/data/interfaces/default/config.html b/data/interfaces/default/config.html index 75219f33..d599471b 100644 --- a/data/interfaces/default/config.html +++ b/data/interfaces/default/config.html @@ -302,6 +302,19 @@ m<%inherit file="base.html"/> +
+ +
+
+
+ + +
+
+ + +
+
@@ -923,6 +936,7 @@ m<%inherit file="base.html"/> initConfigCheckbox("#usenewzbin"); initConfigCheckbox("#usenzbsorg"); initConfigCheckbox("#usewaffles"); + initConfigCheckbox("#userutracker"); initConfigCheckbox("#useblackhole"); initConfigCheckbox("#useapi"); } diff --git a/data/interfaces/default/history.html b/data/interfaces/default/history.html index 162dab2d..616459cc 100644 --- a/data/interfaces/default/history.html +++ b/data/interfaces/default/history.html @@ -45,6 +45,8 @@ fileid = 'nzb' if item['URL'].find('torrent') != -1: fileid = 'torrent' + if item['URL'].find('rutracker') != -1: + fileid = 'torrent' %> ${item['DateAdded']} diff --git a/headphones/__init__.py b/headphones/__init__.py index 3126e793..1157f26e 100644 --- a/headphones/__init__.py +++ b/headphones/__init__.py @@ -154,6 +154,9 @@ MININOVA = None WAFFLES = None WAFFLES_UID = None WAFFLES_PASSKEY = None +RUTRACKER = None +RUTRACKER_USER = None +RUTRACKER_PASSWORD = None DOWNLOAD_TORRENT_DIR = None INTERFACE = None @@ -247,7 +250,7 @@ def initialize(): LOSSLESS_DESTINATION_DIR, PREFERRED_QUALITY, PREFERRED_BITRATE, DETECT_BITRATE, ADD_ARTISTS, CORRECT_METADATA, MOVE_FILES, \ RENAME_FILES, FOLDER_FORMAT, FILE_FORMAT, CLEANUP_FILES, INCLUDE_EXTRAS, EXTRAS, AUTOWANT_UPCOMING, AUTOWANT_ALL, \ ADD_ALBUM_ART, EMBED_ALBUM_ART, EMBED_LYRICS, DOWNLOAD_DIR, BLACKHOLE, BLACKHOLE_DIR, USENET_RETENTION, SEARCH_INTERVAL, \ - TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, DOWNLOAD_TORRENT_DIR, \ + TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, RUTRACKER, RUTRACKER_USER, RUTRACKER_PASSWORD, DOWNLOAD_TORRENT_DIR, \ LIBRARYSCAN_INTERVAL, DOWNLOAD_SCAN_INTERVAL, SAB_HOST, SAB_USERNAME, SAB_PASSWORD, SAB_APIKEY, SAB_CATEGORY, \ NZBMATRIX, NZBMATRIX_USERNAME, NZBMATRIX_APIKEY, NEWZNAB, NEWZNAB_HOST, NEWZNAB_APIKEY, NEWZNAB_ENABLED, EXTRA_NEWZNABS,\ NZBSORG, NZBSORG_UID, NZBSORG_HASH, NEWZBIN, NEWZBIN_UID, NEWZBIN_PASSWORD, LASTFM_USERNAME, INTERFACE, FOLDER_PERMISSIONS, \ @@ -268,6 +271,7 @@ def initialize(): CheckSection('NZBsorg') CheckSection('Newzbin') CheckSection('Waffles') + CheckSection('Rutracker') CheckSection('Prowl') CheckSection('XBMC') CheckSection('NMA') @@ -341,6 +345,10 @@ def initialize(): WAFFLES = bool(check_setting_int(CFG, 'Waffles', 'waffles', 0)) WAFFLES_UID = check_setting_str(CFG, 'Waffles', 'waffles_uid', '') WAFFLES_PASSKEY = check_setting_str(CFG, 'Waffles', 'waffles_passkey', '') + + RUTRACKER = bool(check_setting_int(CFG, 'Rutracker', 'rutracker', 0)) + RUTRACKER_USER = check_setting_str(CFG, 'Rutracker', 'rutracker_user', '') + RUTRACKER_PASSWORD = check_setting_str(CFG, 'Rutracker', 'rutracker_password', '') SAB_HOST = check_setting_str(CFG, 'SABnzbd', 'sab_host', '') SAB_USERNAME = check_setting_str(CFG, 'SABnzbd', 'sab_username', '') @@ -618,6 +626,11 @@ def config_write(): new_config['Waffles']['waffles'] = int(WAFFLES) new_config['Waffles']['waffles_uid'] = WAFFLES_UID new_config['Waffles']['waffles_passkey'] = WAFFLES_PASSKEY + + new_config['Rutracker'] = {} + new_config['Rutracker']['rutracker'] = int(RUTRACKER) + new_config['Rutracker']['rutracker_user'] = RUTRACKER_USER + new_config['Rutracker']['rutracker_password'] = RUTRACKER_PASSWORD new_config['General']['search_interval'] = SEARCH_INTERVAL new_config['General']['libraryscan_interval'] = LIBRARYSCAN_INTERVAL diff --git a/headphones/searcher.py b/headphones/searcher.py index 7470c790..643b6afa 100644 --- a/headphones/searcher.py +++ b/headphones/searcher.py @@ -28,6 +28,9 @@ from headphones import logger, db, helpers, classes, sab import lib.bencode as bencode +import headphones.searcher_rutracker as rutrackersearch +rutracker = rutrackersearch.Rutracker() + class NewzbinDownloader(urllib.FancyURLopener): def __init__(self): @@ -97,7 +100,7 @@ def searchforalbum(albumid=None, new=False, lossless=False): else: foundNZB = searchNZB(result['AlbumID'], new) - if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none": + if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none": if result['Status'] == "Wanted Lossless": searchTorrent(result['AlbumID'], new, losslessOnly=True) else: @@ -109,7 +112,7 @@ def searchforalbum(albumid=None, new=False, lossless=False): if (headphones.NZBMATRIX or headphones.NEWZNAB or headphones.NZBSORG or headphones.NEWZBIN) and (headphones.SAB_HOST or headphones.BLACKHOLE): foundNZB = searchNZB(albumid, new, lossless) - if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none": + if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none": searchTorrent(albumid, new, lossless) def searchNZB(albumid=None, new=False, losslessOnly=False): @@ -632,6 +635,13 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False): results = myDB.select('SELECT ArtistName, AlbumTitle, AlbumID, ReleaseDate from albums WHERE Status="Wanted" OR Status="Wanted Lossless"') new = True + # rutracker login + + if headphones.RUTRACKER and results: + rulogin = rutracker.login(headphones.RUTRACKER_USER, headphones.RUTRACKER_PASSWORD) + if not rulogin: + logger.info(u'Could not login to rutracker, search results will exclude this provider') + for albums in results: albumid = albums[2] @@ -806,7 +816,46 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False): except Exception, e: logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s" % e) - + + # rutracker.org + + if headphones.RUTRACKER and rulogin: + + provider = "rutracker.org" + bitrate = False + + if headphones.PREFERRED_QUALITY == 3 or losslessOnly: + format = 'lossless' + maxsize = 10000000000 + elif headphones.PREFERRED_QUALITY == 1: + format = 'lossless+mp3' + maxsize = 10000000000 + else: + format = 'mp3' + maxsize = 300000000 + if headphones.PREFERRED_QUALITY == 2 and headphones.PREFERRED_BITRATE: + bitrate = True + + # build search url based on above + + searchURL = rutracker.searchurl(artistterm, albumterm, year, format) + logger.info(u'Parsing results from rutracker.org' % searchURL) + + # parse results and get best match + rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid, bitrate) + + # add best match to overall results list + + if rulist: + for ru in rulist: + title = ru[0] + size = ru[1] + url = ru[2] + resultlist.append((title, size, url, provider)) + logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size))) + else: + logger.info(u"No valid results found from %s" % (provider)) + if headphones.ISOHUNT: provider = "isoHunt" @@ -1029,19 +1078,24 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False): # Get torrent name from .torrent, this is usually used by the torrent client as the folder name - torrent_name = torrent_folder_name + '.torrent' download_path = os.path.join(headphones.TORRENTBLACKHOLE_DIR, torrent_name) try: - #Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name. - torrent_file = open(download_path, 'wb') - torrent_file.write(data) - torrent_file.close() - #Open the fresh torrent file again so we can extract the proper torrent name - #Used later in post-processing. - torrent_file = open(download_path, 'rb') + if bestqual[3] == 'rutracker.org': + download_path = rutracker.get_torrent(bestqual[2], headphones.TORRENTBLACKHOLE_DIR) + if not download_path: + break + else: + #Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name. + torrent_file = open(download_path, 'wb') + torrent_file.write(data) + torrent_file.close() + + #Open the fresh torrent file again so we can extract the proper torrent name + #Used later in post-processing. + torrent_file = open(download_path, 'rb') torrent_info = bencode.bdecode(torrent_file.read()) - torrent_file.close() + torrent_file.close() torrent_folder_name = torrent_info['info'].get('name','').decode('utf-8') logger.info('Torrent folder name: %s' % torrent_folder_name) except Exception, e: @@ -1058,7 +1112,12 @@ def preprocesstorrent(resultlist): selresult = result elif int(selresult[1]) < int(result[1]): # if size is lower than new result replace previous selected result (bigger size = better quality?) selresult = result - + + # get outta here if rutracker + + if selresult[3] == 'rutracker.org': + return True, selresult + try: request = urllib2.Request(selresult[2]) request.add_header('Accept-encoding', 'gzip') diff --git a/headphones/searcher_rutracker.py b/headphones/searcher_rutracker.py new file mode 100644 index 00000000..dcdd60a3 --- /dev/null +++ b/headphones/searcher_rutracker.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Headphones rutracker.org search +# Functions called from searcher.py +# Requires BeautifulSoup 4 for parsing http://www.crummy.com/software/BeautifulSoup/ + +import urllib +import urllib2 +import cookielib +from urlparse import urlparse +from bs4 import BeautifulSoup +from headphones import logger, db +import lib.bencode as bencode +import os + +class Rutracker(): + + logged_in = False + # Stores a number of login attempts to prevent recursion. + login_counter = 0 + + def __init__(self): + + self.cookiejar = cookielib.CookieJar() + self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar)) + urllib2.install_opener(self.opener) + + def login(self, login, password): + """Implements tracker login procedure.""" + + self.logged_in = False + + if login is None or password is None: + return False + + self.login_counter += 1 + + # No recursion wanted. + #if self.login_counter > 1: + # return False + + params = urllib.urlencode({"login_username" : login, + "login_password" : password, + "login" : "Вход"}) + + try: + self.opener.open("http://login.rutracker.org/forum/login.php", params) + except : + pass + + # Check if we're logged in + + for cookie in self.cookiejar: + if cookie.name == 'bb_data': + self.logged_in = True + + return self.logged_in + + def searchurl(self, artist, album, year, format): + """ + Return the search url + """ + + # Build search url + + searchterm = artist + searchterm = searchterm + ' ' + searchterm = searchterm + album + searchterm = searchterm + ' ' + searchterm = searchterm + year + + providerurl = "http://rutracker.org/forum/tracker.php" + + if format == 'lossless': + format = '+lossless' + elif format == 'lossless+mp3': + format = '+lossless||mp3||aac' + else: + format = '+mp3||aac' + + # sort by size, descending. + + sort = '&o=7&s=2' + + searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort) + + return searchurl + + def search(self, searchurl, maxsize, minseeders, albumid, bitrate): + """ + Parse the search results and return the first valid torrent + """ + + titles = [] + urls = [] + seeders = [] + sizes = [] + torrentlist = [] + rulist = [] + + try: + + page = self.opener.open(searchurl, timeout=60) + soup = BeautifulSoup(page.read(), from_encoding="utf-8") + + # Debug + #logger.debug (soup.prettify()) + + # Title + + for link in soup.find_all('a', attrs={'class' : 'med tLink bold'}): + title = link.get_text() + titles.append(title) + + # Download URL + + for link in soup.find_all('a', attrs={'class' : 'small tr-dl dl-stub'}): + url = link.get('href') + urls.append(url) + + # Seeders + + for link in soup.find_all('td', attrs={'class' : 'row4 seedmed'}): + seeder = link.get_text() + seeders.append(seeder) + + # Size + + for link in soup.find_all('td', attrs={'class' : 'row4 small nowrap tor-size'}): + size = link.u.string + sizes.append(size) + + except : + pass + + # Combine lists + + torrentlist = zip(titles, urls, seeders, sizes) + + # return if nothing found + + if not torrentlist: + return False + + # get headphones track count for album, return if not found + + hptrackcount = 0 + + myDB = db.DBConnection() + tracks = myDB.select('SELECT TrackTitle from tracks WHERE AlbumID=?', [albumid]) + for track in tracks: + hptrackcount += 1 + + if not hptrackcount: + logger.info('headphones track info not found, cannot compare to torrent') + return False + + # Return the first valid torrent, unless we want a preferred bitrate then we want all valid entries + + for torrent in torrentlist: + + title = torrent[0] + url = torrent[1] + seeders = torrent[2] + size = torrent[3] + + # Attempt to filter out unwanted + + if 'Promo' not in title and 'promo' not in title and 'Vinyl' not in title and 'vinyl' not in title \ + and 'ongbook' not in title and 'TVRip' not in title and 'HDTV' not in title and 'DVD' not in title \ + and int(size) <= maxsize and int(seeders) >= minseeders: + + # Check torrent info + + torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] + self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) + + # Debug + for cookie in self.cookiejar: + logger.debug ('Cookie: %s' % cookie) + + try: + page = self.opener.open(url) + torrent = page.read() + if torrent: + decoded = bencode.bdecode(torrent) + metainfo = decoded['info'] + page.close () + except Exception, e: + logger.error('Error getting torrent: %s' % e) + return False + + # get torrent track count + + trackcount = 0 + + if 'files' in metainfo: # multi + for pathfile in metainfo['files']: + path = pathfile['path'] + for file in path: + if '.ape' in file or '.flac' in file or '.ogg' in file or '.m4a' in file or '.aac' in file or '.mp3' in file or '.wav' in file or '.aif' in file: + trackcount += 1 + + logger.debug ('torrent title: %s' % title) + logger.debug ('hp trackcount: %s' % hptrackcount) + logger.debug ('torrent trackcount: %s' % trackcount) + + #Torrent topic page + + topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id + + # If torrent track count = hp track count then return torrent, + # if greater, check for deluxe/special/foreign editions + + valid = False + + if trackcount == hptrackcount: + valid = True + elif trackcount > hptrackcount: + if 'eluxe' in title or 'dition' in title or 'apanese' in title or 'elease' in title: + valid = True + + # return 1st valid torrent if not checking by bitrate, else add to list and return at end + + if valid: + rulist.append((title, size, topicurl)) + if not bitrate: + return rulist + + return rulist + + + def get_torrent(self, url, savelocation): + + torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] + self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) + downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id + torrent_name = torrent_id + '.torrent' + download_path = os.path.join(savelocation, torrent_name) + + try: + page = self.opener.open(downloadurl) + torrent = page.read() + fp = open (download_path, 'wb') + fp.write (torrent) + fp.close () + except Exception, e: + logger.error('Error getting torrent: %s' % e) + return False + + return download_path + diff --git a/headphones/webserve.py b/headphones/webserve.py index 14f29a80..a6c3472a 100644 --- a/headphones/webserve.py +++ b/headphones/webserve.py @@ -463,6 +463,9 @@ class WebInterface(object): "use_waffles" : checked(headphones.WAFFLES), "waffles_uid" : headphones.WAFFLES_UID, "waffles_passkey": headphones.WAFFLES_PASSKEY, + "use_rutracker" : checked(headphones.RUTRACKER), + "rutracker_user" : headphones.RUTRACKER_USER, + "rutracker_password": headphones.RUTRACKER_PASSWORD, "pref_qual_0" : radio(headphones.PREFERRED_QUALITY, 0), "pref_qual_1" : radio(headphones.PREFERRED_QUALITY, 1), "pref_qual_3" : radio(headphones.PREFERRED_QUALITY, 3), @@ -544,7 +547,7 @@ class WebInterface(object): sab_category=None, download_dir=None, blackhole=0, blackhole_dir=None, usenet_retention=None, nzbmatrix=0, nzbmatrix_username=None, nzbmatrix_apikey=None, newznab=0, newznab_host=None, newznab_apikey=None, newznab_enabled=0, nzbsorg=0, nzbsorg_uid=None, nzbsorg_hash=None, newzbin=0, newzbin_uid=None, newzbin_password=None, preferred_quality=0, preferred_bitrate=None, detect_bitrate=0, move_files=0, torrentblackhole_dir=None, download_torrent_dir=None, - numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rename_files=0, correct_metadata=0, + numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rutracker=0, rutracker_user=None, rutracker_password=None, rename_files=0, correct_metadata=0, cleanup_files=0, add_album_art=0, embed_album_art=0, embed_lyrics=0, destination_dir=None, lossless_destination_dir=None, folder_format=None, file_format=None, include_extras=0, single=0, ep=0, compilation=0, soundtrack=0, live=0, remix=0, spokenword=0, audiobook=0, autowant_upcoming=False, autowant_all=False, interface=None, log_dir=None, music_encoder=0, encoder=None, bitrate=None, samplingfrequency=None, encoderfolder=None, advancedencoder=None, @@ -594,6 +597,9 @@ class WebInterface(object): headphones.WAFFLES = waffles headphones.WAFFLES_UID = waffles_uid headphones.WAFFLES_PASSKEY = waffles_passkey + headphones.RUTRACKER = rutracker + headphones.RUTRACKER_USER = rutracker_user + headphones.RUTRACKER_PASSWORD = rutracker_password headphones.PREFERRED_QUALITY = int(preferred_quality) headphones.PREFERRED_BITRATE = preferred_bitrate headphones.PREFERRED_BITRATE_HIGH_BUFFER = preferred_bitrate_high_buffer diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py new file mode 100644 index 00000000..af8c718d --- /dev/null +++ b/lib/bs4/__init__.py @@ -0,0 +1,355 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides provides methods and Pythonic idioms that make it easy to +navigate, search, and modify the parse tree. + +Beautiful Soup works with Python 2.6 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.1.0" +__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import re +import warnings + +from .builder import builder_registry +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, **kwargs): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. You can pass in features='html' " + "or features='xml' to get a builder capable of handling " + "one or the other.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if len(kwargs) > 0: + arg = kwargs.keys().pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + + if builder is None: + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise ValueError( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + builder = builder_class() + self.builder = builder + self.is_xml = builder.is_xml + self.builder.soup = self + + self.parse_only = parse_only + + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) = ( + self.builder.prepare_markup(markup, from_encoding)) + + try: + self._feed() + except StopParsing: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + """Create a new tag associated with this soup.""" + return Tag(None, self.builder, name, namespace, nsprefix, attrs) + + def new_string(self, s): + """Create a new NavigableString associated with this soup.""" + navigable = NavigableString(s) + navigable.setup() + return navigable + + def insert_before(self, successor): + raise ValueError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise ValueError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.builder.preserve_whitespace_tags)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(currentData)): + return + o = containerClass(currentData) + self.object_was_parsed(o) + + def object_was_parsed(self, o): + """Add an object to the parse tree.""" + o.setup(self.currentTag, self.previous_element) + if self.previous_element: + self.previous_element.next_element = o + self.previous_element = o + self.currentTag.contents.append(o) + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + + for i in range(len(self.tagStack) - 1, 0, -1): + if (name == self.tagStack[i].name + and nsprefix == self.tagStack[i].nsprefix == nsprefix): + numPops = len(self.tagStack) - i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self.previous_element) + if tag is None: + return tag + if self.previous_element: + self.previous_element.next_element = tag + self.previous_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.currentData.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py new file mode 100644 index 00000000..4c22b864 --- /dev/null +++ b/lib/bs4/builder/__init__.py @@ -0,0 +1,307 @@ +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + whitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + features = [] + + is_xml = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + cdata_list_attributes = {} + + + def __init__(self): + self.soup = None + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a

tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty

tag + will be presented as "

", not "

". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), []) + for cdata_list_attr in itertools.chain(universal, tag_specific): + if cdata_list_attr in dict(attrs): + # Basically, we have a "class" attribute whose + # value is a whitespace-separated list of CSS + # classes. Split it into a list. + value = attrs[cdata_list_attr] + values = whitespace_re.split(value) + attrs[cdata_list_attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. + """ + + preserve_whitespace_tags = set(['pre', 'textarea']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. + cdata_list_attributes = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py new file mode 100644 index 00000000..6001e386 --- /dev/null +++ b/lib/bs4/builder/_html5lib.py @@ -0,0 +1,222 @@ +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import NamespacedAttribute +import html5lib +from html5lib.constants import namespaces +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + features = ['html5lib', PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + return markup, None, None, False + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + doc = parser.parse(markup, encoding=self.user_specified_encoding) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment + + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # XXX This has O(n^2) performance, for input like + # "aaa..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + node.element) + old_element.replace_with(new_element) + else: + self.element.append(node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild( + Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild( + TextNode(child, self.soup)) + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py new file mode 100644 index 00000000..ede5cecb --- /dev/null +++ b/lib/bs4/builder/_htmlparser.py @@ -0,0 +1,244 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import ( + HTMLParser, + HTMLParseError, + ) +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. +major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = ( + major > 3 + or (major == 3 and minor > 2) + or (major == 3 and minor == 2 and release >= 3)) + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + def handle_starttag(self, name, attrs): + # XXX namespace + self.soup.handle_starttag(name, None, None, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed. + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + else: + real_name = int(name) + + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + data = u"\N{REPLACEMENT CHARACTER}" + + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + if data.endswith("?") and data.lower().startswith("xml"): + # "An XHTML processing instruction using the trailing '?' + # will cause the '?' to be included in data." - HTMLParser + # docs. + # + # Strip the question mark so we don't end up with two + # question marks. + data = data[:-1] + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + except HTMLParseError, e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like

as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py new file mode 100644 index 00000000..c78fdff6 --- /dev/null +++ b/lib/bs4/builder/_lxml.py @@ -0,0 +1,179 @@ +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +from StringIO import StringIO +import collections +from lxml import etree +from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + TreeBuilder, + XML) +from bs4.dammit import UnicodeDammit + +LXML = 'lxml' + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + + # Well, it's permissive by XML parser standards. + features = [LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + @property + def default_parser(self): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + return etree.XMLParser(target=self, strip_cdata=False, recover=True) + + def __init__(self, parser=None, empty_element_tags=None): + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + if parser is None: + # Use the default parser. + parser = self.default_parser + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False) + self.parser = parser + self.soup = None + self.nsmaps = None + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + if isinstance(markup, basestring): + markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + self.parser.feed(data) + while data != '': + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if data != '': + self.parser.feed(data) + self.parser.close() + + def close(self): + self.nsmaps = None + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and self.nsmaps != None: + # There are no new namespaces for this tag, but namespaces + # are in play, so we need a separate tag stack to know + # when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + if self.nsmaps is None: + self.nsmaps = [] + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + namespace, name = self._getNsTag(name) + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if self.nsmaps != None: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + if len(self.nsmaps) == 0: + # Namespaces are no longer in play, so don't bother keeping + # track of the namespace stack. + self.nsmaps = None + + def pi(self, target, data): + pass + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + features = [LXML, HTML, FAST, PERMISSIVE] + is_xml = False + + @property + def default_parser(self): + return etree.HTMLParser + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py new file mode 100644 index 00000000..58cad9ba --- /dev/null +++ b/lib/bs4/dammit.py @@ -0,0 +1,792 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's the tree builder's job. +""" + +import codecs +from htmlentitydefs import codepoint2name +import re +import warnings + +# Autodetects character encodings. Very useful. +# Download from http://chardet.feedparser.org/ +# or 'apt-get install python-chardet' +# or 'easy_install chardet' +try: + import chardet + #import chardet.constants + #chardet.constants._debug = 1 +except ImportError: + chardet = None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + for codepoint, name in list(codepoint2name.items()): + character = unichr(codepoint) + if codepoint != 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to turn " into the quotation mark. + reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False): + self.declared_html_encoding = None + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + + if markup == '' or isinstance(markup, unicode): + self.markup = markup + self.unicode_markup = unicode(markup) + self.original_encoding = None + return + + new_markup, document_encoding, sniffed_encoding = \ + self._detectEncoding(markup, is_html) + self.markup = new_markup + + u = None + if new_markup != markup: + # _detectEncoding modified the markup, then converted it to + # Unicode and then to UTF-8. So convert it from UTF-8. + u = self._convert_from("utf8") + self.original_encoding = sniffed_encoding + + if not u: + for proposed_encoding in ( + override_encodings + [document_encoding, sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) + if u: + break + + # If no luck and we have auto-detection library, try that: + if not u and chardet and not isinstance(self.markup, unicode): + u = self._convert_from(chardet.detect(self.markup)['encoding']) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convert_from(proposed_encoding) + if u: + break + + # As an absolute last resort, try the encodings again with + # character replacement. + if not u: + for proposed_encoding in ( + override_encodings + [ + document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): + if proposed_encoding != "ascii": + u = self._convert_from(proposed_encoding, "replace") + if u is not None: + warnings.warn( + UnicodeWarning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER.")) + self.contains_replacement_characters = True + break + + # We could at this point force it to ASCII, but that would + # destroy so much data that I think giving up is better + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding, errors) + return newdata + + def _detectEncoding(self, xml_data, is_html=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == b'\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == b'\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ + and (xml_data[2:4] != b'\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ + (xml_data[2:4] != b'\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == b'\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_match = xml_encoding_re.match(xml_data) + if not xml_encoding_match and is_html: + xml_encoding_match = html_meta_re.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if is_html: + self.declared_html_encoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + EBCDIC_TO_ASCII_MAP = None + + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. + MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. + WINDOWS_1252_TO_UTF8 = { + 0x80 : b'\xe2\x82\xac', # € + 0x82 : b'\xe2\x80\x9a', # ‚ + 0x83 : b'\xc6\x92', # ƒ + 0x84 : b'\xe2\x80\x9e', # „ + 0x85 : b'\xe2\x80\xa6', # … + 0x86 : b'\xe2\x80\xa0', # † + 0x87 : b'\xe2\x80\xa1', # ‡ + 0x88 : b'\xcb\x86', # ˆ + 0x89 : b'\xe2\x80\xb0', # ‰ + 0x8a : b'\xc5\xa0', # Š + 0x8b : b'\xe2\x80\xb9', # ‹ + 0x8c : b'\xc5\x92', # Œ + 0x8e : b'\xc5\xbd', # Ž + 0x91 : b'\xe2\x80\x98', # ‘ + 0x92 : b'\xe2\x80\x99', # ’ + 0x93 : b'\xe2\x80\x9c', # “ + 0x94 : b'\xe2\x80\x9d', # ” + 0x95 : b'\xe2\x80\xa2', # • + 0x96 : b'\xe2\x80\x93', # – + 0x97 : b'\xe2\x80\x94', # — + 0x98 : b'\xcb\x9c', # ˜ + 0x99 : b'\xe2\x84\xa2', # ™ + 0x9a : b'\xc5\xa1', # š + 0x9b : b'\xe2\x80\xba', # › + 0x9c : b'\xc5\x93', # œ + 0x9e : b'\xc5\xbe', # ž + 0x9f : b'\xc5\xb8', # Ÿ + 0xa0 : b'\xc2\xa0', #   + 0xa1 : b'\xc2\xa1', # ¡ + 0xa2 : b'\xc2\xa2', # ¢ + 0xa3 : b'\xc2\xa3', # £ + 0xa4 : b'\xc2\xa4', # ¤ + 0xa5 : b'\xc2\xa5', # ¥ + 0xa6 : b'\xc2\xa6', # ¦ + 0xa7 : b'\xc2\xa7', # § + 0xa8 : b'\xc2\xa8', # ¨ + 0xa9 : b'\xc2\xa9', # © + 0xaa : b'\xc2\xaa', # ª + 0xab : b'\xc2\xab', # « + 0xac : b'\xc2\xac', # ¬ + 0xad : b'\xc2\xad', # ­ + 0xae : b'\xc2\xae', # ® + 0xaf : b'\xc2\xaf', # ¯ + 0xb0 : b'\xc2\xb0', # ° + 0xb1 : b'\xc2\xb1', # ± + 0xb2 : b'\xc2\xb2', # ² + 0xb3 : b'\xc2\xb3', # ³ + 0xb4 : b'\xc2\xb4', # ´ + 0xb5 : b'\xc2\xb5', # µ + 0xb6 : b'\xc2\xb6', # ¶ + 0xb7 : b'\xc2\xb7', # · + 0xb8 : b'\xc2\xb8', # ¸ + 0xb9 : b'\xc2\xb9', # ¹ + 0xba : b'\xc2\xba', # º + 0xbb : b'\xc2\xbb', # » + 0xbc : b'\xc2\xbc', # ¼ + 0xbd : b'\xc2\xbd', # ½ + 0xbe : b'\xc2\xbe', # ¾ + 0xbf : b'\xc2\xbf', # ¿ + 0xc0 : b'\xc3\x80', # À + 0xc1 : b'\xc3\x81', # Á + 0xc2 : b'\xc3\x82', #  + 0xc3 : b'\xc3\x83', # à + 0xc4 : b'\xc3\x84', # Ä + 0xc5 : b'\xc3\x85', # Å + 0xc6 : b'\xc3\x86', # Æ + 0xc7 : b'\xc3\x87', # Ç + 0xc8 : b'\xc3\x88', # È + 0xc9 : b'\xc3\x89', # É + 0xca : b'\xc3\x8a', # Ê + 0xcb : b'\xc3\x8b', # Ë + 0xcc : b'\xc3\x8c', # Ì + 0xcd : b'\xc3\x8d', # Í + 0xce : b'\xc3\x8e', # Î + 0xcf : b'\xc3\x8f', # Ï + 0xd0 : b'\xc3\x90', # Ð + 0xd1 : b'\xc3\x91', # Ñ + 0xd2 : b'\xc3\x92', # Ò + 0xd3 : b'\xc3\x93', # Ó + 0xd4 : b'\xc3\x94', # Ô + 0xd5 : b'\xc3\x95', # Õ + 0xd6 : b'\xc3\x96', # Ö + 0xd7 : b'\xc3\x97', # × + 0xd8 : b'\xc3\x98', # Ø + 0xd9 : b'\xc3\x99', # Ù + 0xda : b'\xc3\x9a', # Ú + 0xdb : b'\xc3\x9b', # Û + 0xdc : b'\xc3\x9c', # Ü + 0xdd : b'\xc3\x9d', # Ý + 0xde : b'\xc3\x9e', # Þ + 0xdf : b'\xc3\x9f', # ß + 0xe0 : b'\xc3\xa0', # à + 0xe1 : b'\xa1', # á + 0xe2 : b'\xc3\xa2', # â + 0xe3 : b'\xc3\xa3', # ã + 0xe4 : b'\xc3\xa4', # ä + 0xe5 : b'\xc3\xa5', # å + 0xe6 : b'\xc3\xa6', # æ + 0xe7 : b'\xc3\xa7', # ç + 0xe8 : b'\xc3\xa8', # è + 0xe9 : b'\xc3\xa9', # é + 0xea : b'\xc3\xaa', # ê + 0xeb : b'\xc3\xab', # ë + 0xec : b'\xc3\xac', # ì + 0xed : b'\xc3\xad', # í + 0xee : b'\xc3\xae', # î + 0xef : b'\xc3\xaf', # ï + 0xf0 : b'\xc3\xb0', # ð + 0xf1 : b'\xc3\xb1', # ñ + 0xf2 : b'\xc3\xb2', # ò + 0xf3 : b'\xc3\xb3', # ó + 0xf4 : b'\xc3\xb4', # ô + 0xf5 : b'\xc3\xb5', # õ + 0xf6 : b'\xc3\xb6', # ö + 0xf7 : b'\xc3\xb7', # ÷ + 0xf8 : b'\xc3\xb8', # ø + 0xf9 : b'\xc3\xb9', # ù + 0xfa : b'\xc3\xba', # ú + 0xfb : b'\xc3\xbb', # û + 0xfc : b'\xc3\xbc', # ü + 0xfd : b'\xc3\xbd', # ý + 0xfe : b'\xc3\xbe', # þ + } + + MULTIBYTE_MARKERS_AND_SIZES = [ + (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF + (0xe0, 0xef, 3), # 3-byte characters start with E0-EF + (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 + ] + + FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] + LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + The input must be a bytestring. If you've already converted + the document to Unicode, you're too late. + + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/lib/bs4/element.py b/lib/bs4/element.py new file mode 100644 index 00000000..91a40078 --- /dev/null +++ b/lib/bs4/element.py @@ -0,0 +1,1347 @@ +import collections +import re +import sys +import warnings +from bs4.dammit import EntitySubstitution + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +whitespace_re = re.compile("\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +class NamespacedAttribute(unicode): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = unicode.__new__(cls, prefix) + else: + obj = unicode.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(unicode): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + # There are five possible values for the "formatter" argument passed in + # to methods like encode() and prettify(): + # + # "html" - All Unicode characters with corresponding HTML entities + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to + # XML entities: & < > + # None - The null formatter. Unicode characters are never + # converted to entities. This is not recommended, but it's + # faster than "minimal". + # A function - This function will be called on every string that + # needs to undergo entity substition + FORMATTERS = { + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, + None : None + } + + @classmethod + def format_string(self, s, formatter='minimal'): + """Format the given string using the given formatter.""" + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = s + else: + output = formatter(s) + return output + + def setup(self, parent=None, previous_element=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + self.next_element = None + self.previous_sibling = None + self.next_sibling = None + if self.parent is not None and self.parent.contents: + self.previous_sibling = self.parent.contents[-1] + self.previous_sibling.next_sibling = self + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract() + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + my_parent = self.parent + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent is not None: + del self.parent.contents[self.parent.index(self)] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if self.previous_element is not None: + self.previous_element.next_element = next_element + if next_element is not None: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if self.previous_sibling is not None: + self.previous_sibling.next_sibling = self.next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self): + "Finds the last element beneath this object to be parsed." + last_child = self + while hasattr(last_child, 'contents') and last_child.contents: + last_child = last_child.contents[-1] + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, basestring) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. + position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant() + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant() + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def insert_before(self, predecessor): + """Makes the given element the immediate predecessor of this one. + + The two elements will have the same parent, and the given element + will be immediately before this one. + """ + if self is predecessor: + raise ValueError("Can't insert an element before itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, successor): + """Makes the given element the immediate successor of this one. + + The two elements will have the same parent, and the given element + will be immediately after this one. + """ + if self is successor: + raise ValueError("Can't insert an element after itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1, successor) + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + elif text is None and not limit and not attrs and not kwargs: + # Optimization to find all tags. + if name is True or name is None: + return [element for element in generator + if isinstance(element, Tag)] + # Optimization to find all tags with a given name. + elif isinstance(name, basestring): + return [element for element in generator + if isinstance(element, Tag) and element.name == name] + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. + @property + def next_elements(self): + i = self.next_element + while i is not None: + yield i + i = i.next_element + + @property + def next_siblings(self): + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + i = self.parent + while i is not None: + yield i + i = i.parent + + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'=?"?(?P[^\]"]*)"?\]$' + ) + + def _attr_value_as_string(self, value, default=None): + """Force an attribute value into a string representation. + + A multi-valued attribute will be converted into a + space-separated stirng. + """ + value = self.get(value, default) + if isinstance(value, list) or isinstance(value, tuple): + value =" ".join(value) + return value + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. + """ + if operator == '=': + # string representation of `attribute` is equal to `value` + return lambda el: el._attr_value_as_string(attribute) == value + elif operator == '~': + # space-separated list representation of `attribute` + # contains `value` + def _includes_value(element): + attribute_value = element.get(attribute, []) + if not isinstance(attribute_value, list): + attribute_value = attribute_value.split() + return value in attribute_value + return _includes_value + elif operator == '^': + # string representation of `attribute` starts with `value` + return lambda el: el._attr_value_as_string( + attribute, '').startswith(value) + elif operator == '$': + # string represenation of `attribute` ends with `value` + return lambda el: el._attr_value_as_string( + attribute, '').endswith(value) + elif operator == '*': + # string representation of `attribute` contains `value` + return lambda el: value in el._attr_value_as_string(attribute, '') + elif operator == '|': + # string representation of `attribute` is either exactly + # `value` or starts with `value` and then a dash. + def _is_or_starts_with_dash(element): + attribute_value = element._attr_value_as_string(attribute, '') + return (attribute_value == value or attribute_value.startswith( + value + '-')) + return _is_or_starts_with_dash + else: + return lambda el: el.has_attr(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + tokens = selector.split() + current_context = [self] + for index, token in enumerate(tokens): + if tokens[index - 1] == '>': + # already found direct descendants in last step. skip this + # step. + continue + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend( + [el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + + if '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + if not tag_name: + tag_name = True + classes = set(klass.split('.')) + found = [] + def classes_match(tag): + if tag_name is not True and tag.name != tag_name: + return False + if not tag.has_attr('class'): + return False + return classes.issubset(tag['class']) + for context in current_context: + found.extend(context.find_all(classes_match)) + current_context = found + continue + + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + current_context = found + continue + + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. + def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + + +class NavigableString(unicode, PageElement): + + PREFIX = '' + SUFFIX = '' + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, unicode): + return unicode.__new__(cls, value) + return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + + def __getnewargs__(self): + return (unicode(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) + + def output_ready(self, formatter="minimal"): + output = self.format_string(self, formatter) + return self.PREFIX + output + self.SUFFIX + + +class PreformattedString(NavigableString): + """A NavigableString not subject to the normal formatting rules. + + The string will be passed into the formatter (to trigger side effects), + but the return value will be ignored. + """ + + def output_ready(self, formatter="minimal"): + """CData strings are passed into the formatter. + But the return value is ignored.""" + self.format_string(self, formatter) + return self.PREFIX + self + self.SUFFIX + +class CData(PreformattedString): + + PREFIX = u'' + +class ProcessingInstruction(PreformattedString): + + PREFIX = u'' + +class Comment(PreformattedString): + + PREFIX = u'' + + +class Declaration(PreformattedString): + PREFIX = u'' + + +class Doctype(PreformattedString): + + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' "%s"' % system_id + elif system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + + PREFIX = u'\n' + + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, parser=None, builder=None, name=None, namespace=None, + prefix=None, attrs=None, parent=None, previous=None): + "Basic constructor." + + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if attrs is None: + attrs = {} + elif builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + # Set up any substitutions, such as the charset in a META tag. + if builder is not None: + builder.set_up_substitutions(self) + self.can_be_empty_element = builder.can_be_empty_element(name) + else: + self.can_be_empty_element = False + + parserClass = _alias("parser_class") # BS3 + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this tag. + + :Return: If this tag has a single string child, return value + is that string. If this tag has no children, or more than one + child, return value is None. If this tag has one child tag, + return value is the 'string' attribute of the child tag, + recursively. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False): + """Yield all child strings, possibly stripping them.""" + for descendant in self.descendants: + if not isinstance(descendant, NavigableString): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + strings = property(_all_strings) + + @property + def stripped_strings(self): + for string in self._all_strings(True): + yield string + + def get_text(self, separator="", strip=False): + """ + Get all child strings, concatenated using the given separator. + """ + return separator.join([s for s in self._all_strings(strip)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next_element + i.__dict__.clear() + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. + """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def has_attr(self, key): + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag=="contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.encode(encoding) + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ + attrs = [] + if self.attrs: + for key, val in sorted(self.attrs.items()): + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None): + val = val.encode(eventual_encoding) + + text = self.format_string(val, formatter) + decoded = ( + str(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + if self.is_empty_element: + close = '/' + else: + closeTag = '' % self.name + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + pretty_print = (indent_level is not None) + if pretty_print: + space = (' ' * (indent_level - 1)) + indent_contents = indent_level + 1 + else: + space = '' + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter) + + if self.hidden: + # This is the 'document root' object. + s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if pretty_print: + s.append(space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if pretty_print and closeTag and self.next_sibling: + s.append("\n") + s = ''.join(s) + return s + + def prettify(self, encoding=None, formatter="minimal"): + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + if text and indent_level: + text = text.strip() + if text: + if pretty_print: + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print: + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a bytestring.""" + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + # This was kind of misleading because has_key() (attributes) was + # different from __in__ (contents). has_key() is gone in Python 3, + # anyway. + has_key = has_attr + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in attrs.items(): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. + if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, unicode)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. + return unicode(str(value)) + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + call_function_with_tag_data = ( + isinstance(self.name, collections.Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, basestring): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. + if (isinstance(match_against, unicode) + and ' ' in match_against): + # A bit of a special case. If they try to match "foo + # bar" on a multivalue attribute's value, only accept + # the literal value "foo bar" + # + # XXX This is going to be pretty slow because we keep + # splitting match_against. But it shouldn't come up + # too often. + return (whitespace_re.split(match_against) == markup) + else: + for item in markup: + if self._matches(item, match_against): + return True + return False + + if match_against is True: + # True matches any non-None value. + return markup is not None + + if isinstance(match_against, collections.Callable): + return match_against(markup) + + # Custom callables take the tag as an argument, but all + # other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + + # Ensure that `markup` is either a Unicode string, or None. + markup = self._normalize_search_value(markup) + + if markup is None: + # None matches None, False, an empty string, an empty list, and so on. + return not match_against + + if isinstance(match_against, unicode): + # Exact string match + return markup == match_against + + if hasattr(match_against, 'match'): + # Regexp match + return match_against.search(markup) + + if hasattr(match_against, '__iter__'): + # The markup must be an exact match against something + # in the iterable. + return markup in match_against + + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py new file mode 100644 index 00000000..5a84b0ba --- /dev/null +++ b/lib/bs4/testing.py @@ -0,0 +1,515 @@ +"""Helper classes for tests.""" + +import copy +import functools +import unittest +from unittest import TestCase +from bs4 import BeautifulSoup +from bs4.element import ( + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + SoupStrainer, +) + +from bs4.builder import HTMLParserTreeBuilder +default_builder = HTMLParserTreeBuilder + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder() + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder.test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + +class HTMLTreeBuilderSmokeTest(object): + + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment): + """Generate and parse a document with the given doctype.""" + doctype = '' % doctype_fragment + markup = doctype + '\n

foo

' + soup = self.soup(markup) + return doctype, soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A

tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("

") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "

") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("

", "

") + self.assertSoupEquals("", "") + + self.assertSoupEquals("
", "
") + + def test_br_is_always_empty_element_tag(self): + """A
tag is designated as an empty-element tag. + + Some parsers treat

as one
tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("

") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "
") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("") + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "

foobaz

" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEqual(comment.__class__, Comment) + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in
 and ")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "Inside a B tag"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "

A nested tag

" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "

A doubly nested tag

" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup('

Foo

') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('', '') + + def test_entities_in_attributes_converted_to_unicode(self): + expect = u'

' + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + + def test_quot_entity_converted_to_quotation_mark(self): + self.assertSoupEquals("

I said "good day!"

", + '

I said "good day!"

') + + def test_out_of_range_entity(self): + expect = u"\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'4' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. + # + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A bold statement", + parse_only=strainer) + self.assertEqual(soup.decode(), "bold") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("", + '') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """a""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """a""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """a""") + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assertSoupEquals('', + '') + + self.assertSoupEquals( + 'foo', + 'foo') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" + self.assertSoupEquals(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = b"

\x91Foo\x92

" + soup = self.soup(quote) + self.assertEqual( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + + def test_entities_converted_on_the_way_out(self): + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + + def test_real_iso_latin_document(self): + # Smoke test of interrelated functionality, using an + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. + unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + + # That's because we're going to encode it into ISO-Latin-1, and use + # that to test. + iso_latin_html = unicode_html.encode("iso-8859-1") + + # Parse the ISO-Latin-1 HTML. + soup = self.soup(iso_latin_html) + # Encode it to UTF-8. + result = soup.encode("utf-8") + + # What do we expect the result to look like? Well, it would + # look like unicode_html, except that the META tag would say + # UTF-8 instead of ISO-Latin-1. + expected = unicode_html.replace("ISO-Latin-1", "utf-8") + + # And, of course, it would be in UTF-8, not Unicode. + expected = expected.encode("utf-8") + + # Ta-da! + self.assertEqual(result, expected) + + def test_real_shift_jis_document(self): + # Smoke test to make sure the parser can handle a document in + # Shift-JIS encoding, without choking. + shift_jis_html = ( + b'
'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'
') + unicode_html = shift_jis_html.decode("shift-jis") + soup = self.soup(unicode_html) + + # Make sure the parse tree is correctly encoded to various + # encodings. + self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) + self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-9 (a + # Hebrew encoding) to UTF-8. + hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' + soup = self.soup( + hebrew_document, from_encoding="iso8859-8") + self.assertEqual(soup.original_encoding, 'iso8859-8') + self.assertEqual( + soup.encode('utf-8'), + hebrew_document.decode("iso8859-8").encode("utf-8")) + + def test_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) + content = parsed_meta['content'] + self.assertEqual('text/html; charset=x-sjis', content) + + # But that value is actually a ContentMetaAttributeValue object. + self.assertTrue(isinstance(content, ContentMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('text/html; charset=utf8', content.encode("utf8")) + + # For the rest of the story, see TestSubstitutions in + # test_tree.py. + + def test_html5_style_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', id="encoding") + charset = parsed_meta['charset'] + self.assertEqual('x-sjis', charset) + + # But that value is actually a CharsetMetaAttributeValue object. + self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('utf8', charset.encode("utf8")) + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("text") + data.a['foo'] = 'bar' + self.assertEqual('text', data.a.decode()) + +class XMLTreeBuilderSmokeTest(object): + + def test_docstring_generated(self): + soup = self.soup("") + self.assertEqual( + soup.encode(), b'\n') + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("") + self.assertEqual( + soup.encode("latin1"), + b'\n') + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'\n' + + b'0' * (2**12) + + b'') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("

", "

") + self.assertSoupEquals("

foo

") + + def test_namespaces_are_preserved(self): + markup = 'This tag is in the a namespaceThis tag is in the b namespace' + soup = self.soup(markup) + root = soup.root + self.assertEqual("http://example.com/", root['xmlns:a']) + self.assertEqual("http://example.net/", root['xmlns:b']) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. + pass + + def test_html_tags_have_namespace(self): + markup = "" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '5' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py new file mode 100644 index 00000000..142c8cc3 --- /dev/null +++ b/lib/bs4/tests/__init__.py @@ -0,0 +1 @@ +"The beautifulsoup tests." diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py new file mode 100644 index 00000000..92ad10fb --- /dev/null +++ b/lib/bs4/tests/test_builder_registry.py @@ -0,0 +1,141 @@ +"""Tests of the builder registry.""" + +import unittest + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + HTMLParserTreeBuilder, + TreeBuilderRegistry, +) + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError: + HTML5LIB_PRESENT = False + +try: + from bs4.builder import ( + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + ) + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + + if LXML_PRESENT: + self.assertEqual(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) + else: + self.assertEqual(registry.lookup('xml'), None) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) + + def test_named_library(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib'), + HTML5TreeBuilder) + + self.assertEqual(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + + def test_beautifulsoup_constructor_does_lookup(self): + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEqual(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. + self.assertEqual(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('foo'), builder) + self.assertEqual(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEqual(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEqual(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEqual(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/lib/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py new file mode 100644 index 00000000..5b9f6770 --- /dev/null +++ b/lib/bs4/tests/test_docs.py @@ -0,0 +1,36 @@ +"Test harness for doctests." + +# pylint: disable-msg=E0611,W0142 + +__metaclass__ = type +__all__ = [ + 'additional_tests', + ] + +import atexit +import doctest +import os +#from pkg_resources import ( +# resource_filename, resource_exists, resource_listdir, cleanup_resources) +import unittest + +DOCTEST_FLAGS = ( + doctest.ELLIPSIS | + doctest.NORMALIZE_WHITESPACE | + doctest.REPORT_NDIFF) + + +# def additional_tests(): +# "Run the doc tests (README.txt and docs/*, if any exist)" +# doctest_files = [ +# os.path.abspath(resource_filename('bs4', 'README.txt'))] +# if resource_exists('bs4', 'docs'): +# for name in resource_listdir('bs4', 'docs'): +# if name.endswith('.txt'): +# doctest_files.append( +# os.path.abspath( +# resource_filename('bs4', 'docs/%s' % name))) +# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) +# atexit.register(cleanup_resources) +# return unittest.TestSuite(( +# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py new file mode 100644 index 00000000..f195f7d0 --- /dev/null +++ b/lib/bs4/tests/test_html5lib.py @@ -0,0 +1,58 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import warnings + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError, e: + HTML5LIB_PRESENT = False +from bs4.element import SoupStrainer +from bs4.testing import ( + HTML5TreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing its tree builder.") +class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_soupstrainer(self): + # The html5lib tree builder does not support SoupStrainers. + strainer = SoupStrainer("b") + markup = "

A bold statement.

" + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup, parse_only=strainer) + self.assertEqual( + soup.decode(), self.document_for(markup)) + + self.assertTrue( + "the html5lib tree builder doesn't support parse_only" in + str(w[0].message)) + + def test_correctly_nested_tables(self): + """html5lib inserts tags where other parsers don't.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py new file mode 100644 index 00000000..bcb5ed23 --- /dev/null +++ b/lib/bs4/tests/test_htmlparser.py @@ -0,0 +1,19 @@ +"""Tests to ensure that the html.parser tree builder generates good +trees.""" + +from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest +from bs4.builder import HTMLParserTreeBuilder + +class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + + @property + def default_builder(self): + return HTMLParserTreeBuilder() + + def test_namespaced_system_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass + + def test_namespaced_public_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py new file mode 100644 index 00000000..39e26bfb --- /dev/null +++ b/lib/bs4/tests/test_lxml.py @@ -0,0 +1,75 @@ +"""Tests to ensure that the lxml tree builder generates good trees.""" + +import re +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True +except ImportError, e: + LXML_PRESENT = False + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, + ) +from bs4.element import Comment, Doctype, SoupStrainer +from bs4.testing import skipIf +from bs4.tests import test_htmlparser +from bs4.testing import ( + HTMLTreeBuilderSmokeTest, + XMLTreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its tree builder.") +class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilder() + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "

foo�bar

", "

foobar

") + self.assertSoupEquals( + "

foo�bar

", "

foobar

") + self.assertSoupEquals( + "

foo�bar

", "

foobar

") + + def test_beautifulstonesoup_is_xml_parser(self): + # Make sure that the deprecated BSS class uses an xml builder + # if one is installed. + with warnings.catch_warnings(record=False) as w: + soup = BeautifulStoneSoup("") + self.assertEqual(u"", unicode(soup.b)) + + def test_real_xhtml_document(self): + """lxml strips the XML definition from an XHTML doc, which is fine.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b''), + markup.replace(b'\n', b'').replace( + b'', b'')) + + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its XML tree builder.") +class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() + diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py new file mode 100644 index 00000000..23a664e7 --- /dev/null +++ b/lib/bs4/tests/test_soup.py @@ -0,0 +1,368 @@ +# -*- coding: utf-8 -*- +"""Tests of Beautiful Soup as a whole.""" + +import unittest +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, +) +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + SoupStrainer, + NamespacedAttribute, + ) +import bs4.dammit +from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.testing import ( + SoupTest, + skipIf, +) +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True +except ImportError, e: + LXML_PRESENT = False + +class TestDeprecatedConstructorArguments(SoupTest): + + def test_parseOnlyThese_renamed_to_parse_only(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("
", parseOnlyThese=SoupStrainer("b")) + msg = str(w[0].message) + self.assertTrue("parseOnlyThese" in msg) + self.assertTrue("parse_only" in msg) + self.assertEqual(b"", soup.encode()) + + def test_fromEncoding_renamed_to_from_encoding(self): + with warnings.catch_warnings(record=True) as w: + utf8 = b"\xc3\xa9" + soup = self.soup(utf8, fromEncoding="utf8") + msg = str(w[0].message) + self.assertTrue("fromEncoding" in msg) + self.assertTrue("from_encoding" in msg) + self.assertEqual("utf8", soup.original_encoding) + + def test_unrecognized_keyword_argument(self): + self.assertRaises( + TypeError, self.soup, "", no_such_argument=True) + + @skipIf( + not LXML_PRESENT, + "lxml not present, not testing BeautifulStoneSoup.") + def test_beautifulstonesoup(self): + with warnings.catch_warnings(record=True) as w: + soup = BeautifulStoneSoup("") + self.assertTrue(isinstance(soup, BeautifulSoup)) + self.assertTrue("BeautifulStoneSoup class is deprecated") + +class TestSelectiveParsing(SoupTest): + + def test_parse_with_soupstrainer(self): + markup = "NoYesNoYes Yes" + strainer = SoupStrainer("b") + soup = self.soup(markup, parse_only=strainer) + self.assertEqual(soup.encode(), b"YesYes Yes") + + +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEqual(self.sub.substitute_html(s), + u"foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. + quotes = b"\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + self.assertEqual(self.sub.substitute_html(dammit.markup), + "‘’foo“”") + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + self.assertEqual(self.sub.substitute_xml(s, False), s) + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + self.assertEqual(self.sub.substitute_xml("Welcome", True), + '"Welcome"') + self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), + '"Bob\'s Bar"') + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + self.assertEqual(self.sub.substitute_xml(s, True), + "'Welcome to \"my bar\"'") + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + self.assertEqual( + self.sub.substitute_xml(s, True), + '"Welcome to "Bob\'s Bar""') + + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + self.assertEqual(self.sub.substitute_xml(quoted), quoted) + + def test_xml_quoting_handles_angle_brackets(self): + self.assertEqual( + self.sub.substitute_xml("foo"), + "foo<bar>") + + def test_xml_quoting_handles_ampersands(self): + self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") + + def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): + self.assertEqual( + self.sub.substitute_xml("ÁT&T"), + "ÁT&T") + + def test_quotes_not_html_substituted(self): + """There's no need to do this except inside attribute values.""" + text = 'Bob\'s "bar"' + self.assertEqual(self.sub.substitute_html(text), text) + + +class TestEncodingConversion(SoupTest): + # Test Beautiful Soup's ability to decode and encode from various + # encodings. + + def setUp(self): + super(TestEncodingConversion, self).setUp() + self.unicode_data = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + self.utf8_data = self.unicode_data.encode("utf-8") + # Just so you know what it looks like. + self.assertEqual( + self.utf8_data, + b"Sacr\xc3\xa9 bleu!") + + def test_ascii_in_unicode_out(self): + # ASCII input is converted to Unicode. The original_encoding + # attribute is set. + ascii = b"a" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding, "ascii") + + def test_unicode_in_unicode_out(self): + # Unicode input is left alone. The original_encoding attribute + # is not set. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.decode(), self.unicode_data) + self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.original_encoding, None) + + def test_utf8_in_unicode_out(self): + # UTF-8 input is converted to Unicode. The original_encoding + # attribute is set. + soup_from_utf8 = self.soup(self.utf8_data) + self.assertEqual(soup_from_utf8.decode(), self.unicode_data) + self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + + def test_utf8_out(self): + # The internal data structures can be encoded as UTF-8. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) + + +class TestUnicodeDammit(unittest.TestCase): + """Standalone tests of Unicode, Dammit.""" + + def test_smart_quotes_to_unicode(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup) + self.assertEqual( + dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") + + def test_smart_quotes_to_xml_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_html_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="html") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_ascii(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="ascii") + self.assertEqual( + dammit.unicode_markup, """''""""") + + def test_detect_utf8(self): + utf8 = b"\xc3\xa9" + dammit = UnicodeDammit(utf8) + self.assertEqual(dammit.unicode_markup, u'\xe9') + self.assertEqual(dammit.original_encoding, 'utf-8') + + def test_convert_hebrew(self): + hebrew = b"\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding, 'iso-8859-8') + self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + + def test_dont_see_smart_quotes_where_there_are_none(self): + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + dammit = UnicodeDammit(utf_8) + self.assertEqual(dammit.original_encoding, 'utf-8') + self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) + + def test_ignore_inappropriate_codecs(self): + utf8_data = u"Räksmörgås".encode("utf-8") + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding, 'utf-8') + + def test_ignore_invalid_codecs(self): + utf8_data = u"Räksmörgås".encode("utf-8") + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + dammit = UnicodeDammit(utf8_data, [bad_encoding]) + self.assertEqual(dammit.original_encoding, 'utf-8') + + def test_detect_html5_style_meta_tag(self): + + for data in ( + b'', + b"", + b"", + b""): + dammit = UnicodeDammit(data, is_html=True) + self.assertEqual( + "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (ie. encoded with some other + # encoding). + # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually encode the document as UTF-8 + # and encode the incompatible characters as REPLACEMENT + # CHARACTER. + # + # If chardet is installed, it will detect that the document + # can be converted into ISO-8859-1 without errors. This happens + # to be the wrong encoding, but it is a consistent encoding, so the + # code we're testing here won't run. + # + # So we temporarily disable chardet if it's present. + doc = b"""\357\273\277 +\330\250\330\252\330\261 +\310\322\321\220\312\321\355\344""" + chardet = bs4.dammit.chardet + try: + bs4.dammit.chardet = None + with warnings.catch_warnings(record=True) as w: + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue(u"\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc, "html.parser") + self.assertTrue(soup.contains_replacement_characters) + + msg = w[0].message + self.assertTrue(isinstance(msg, UnicodeWarning)) + self.assertTrue("Some characters could not be decoded" in str(msg)) + finally: + bs4.dammit.chardet = chardet + + def test_sniffed_xml_encoding(self): + # A document written in UTF-16LE will be converted by a different + # code path that sniffs the byte order markers. + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' + dammit = UnicodeDammit(data) + self.assertEqual(u"áé", dammit.unicode_markup) + self.assertEqual("utf-16le", dammit.original_encoding) + + def test_detwingle(self): + # Here's a UTF8 document. + utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + + # Here's a Windows-1252 document. + windows_1252 = ( + u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + + # Through some unholy alchemy, they've been stuck together. + doc = utf8 + windows_1252 + utf8 + + # The document can't be turned into UTF-8: + self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") + + # Unicode, Dammit thinks the whole document is Windows-1252, + # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" + + # But if we run it through fix_embedded_windows_1252, it's fixed: + + fixed = UnicodeDammit.detwingle(doc) + self.assertEqual( + u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + + def test_detwingle_ignores_multibyte_characters(self): + # Each of these characters has a UTF-8 representation ending + # in \x93. \x93 is a smart quote if interpreted as + # Windows-1252. But our code knows to skip over multibyte + # UTF-8 characters, so they'll survive the process unscathed. + for tricky_unicode_char in ( + u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + ): + input = tricky_unicode_char.encode("utf8") + self.assertTrue(input.endswith(b'\x93')) + output = UnicodeDammit.detwingle(input) + self.assertEqual(output, input) + +class TestNamedspacedAttribute(SoupTest): + + def test_name_may_be_none(self): + a = NamespacedAttribute("xmlns", None) + self.assertEqual(a, "xmlns") + + def test_attribute_is_equivalent_to_colon_separated_string(self): + a = NamespacedAttribute("a", "b") + self.assertEqual("a:b", a) + + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): + a = NamespacedAttribute("a", "b", "c") + b = NamespacedAttribute("a", "b", "c") + self.assertEqual(a, b) + + # The actual namespace is not considered. + c = NamespacedAttribute("a", "b", None) + self.assertEqual(a, c) + + # But name and prefix are important. + d = NamespacedAttribute("a", "z", "c") + self.assertNotEqual(a, d) + + e = NamespacedAttribute("z", "b", "c") + self.assertNotEqual(a, e) + + +class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): + + def test_content_meta_attribute_value(self): + value = CharsetMetaAttributeValue("euc-jp") + self.assertEqual("euc-jp", value) + self.assertEqual("euc-jp", value.original_value) + self.assertEqual("utf8", value.encode("utf8")) + + + def test_content_meta_attribute_value(self): + value = ContentMetaAttributeValue("text/html; charset=euc-jp") + self.assertEqual("text/html; charset=euc-jp", value) + self.assertEqual("text/html; charset=euc-jp", value.original_value) + self.assertEqual("text/html; charset=utf8", value.encode("utf8")) diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py new file mode 100644 index 00000000..cc573ede --- /dev/null +++ b/lib/bs4/tests/test_tree.py @@ -0,0 +1,1695 @@ +# -*- coding: utf-8 -*- +"""Tests for Beautiful Soup's tree traversal methods. + +The tree traversal methods are the main advantage of using Beautiful +Soup over just using a parser. + +Different parsers will build different Beautiful Soup trees given the +same markup, but all Beautiful Soup trees can be traversed with the +methods tested here. +""" + +import copy +import pickle +import re +import warnings +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry, + HTMLParserTreeBuilder, +) +from bs4.element import ( + CData, + Doctype, + NavigableString, + SoupStrainer, + Tag, +) +from bs4.testing import ( + SoupTest, + skipIf, +) + +XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) +LXML_PRESENT = (builder_registry.lookup("lxml") is not None) + +class TreeTest(SoupTest): + + def assertSelects(self, tags, should_match): + """Make sure that the given tags have the correct text. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + self.assertEqual([tag.string for tag in tags], should_match) + + def assertSelectsIDs(self, tags, should_match): + """Make sure that the given tags have the correct IDs. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + self.assertEqual([tag['id'] for tag in tags], should_match) + + +class TestFind(TreeTest): + """Basic tests of the find() method. + + find() just calls find_all() with limit=1, so it's not tested all + that thouroughly here. + """ + + def test_find_tag(self): + soup = self.soup("1234") + self.assertEqual(soup.find("b").string, "2") + + def test_unicode_text_find(self): + soup = self.soup(u'

Räksmörgås

') + self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') + +class TestFindAll(TreeTest): + """Basic tests of the find_all() method.""" + + def test_find_all_text_nodes(self): + """You can search the tree for text nodes.""" + soup = self.soup("Foobar\xbb") + # Exact match. + self.assertEqual(soup.find_all(text="bar"), [u"bar"]) + # Match any of a number of strings. + self.assertEqual( + soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) + # Match a regular expression. + self.assertEqual(soup.find_all(text=re.compile('.*')), + [u"Foo", u"bar", u'\xbb']) + # Match anything. + self.assertEqual(soup.find_all(text=True), + [u"Foo", u"bar", u'\xbb']) + + def test_find_all_limit(self): + """You can limit the number of items returned by find_all.""" + soup = self.soup("12345") + self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) + self.assertSelects(soup.find_all('a', limit=1), ["1"]) + self.assertSelects( + soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) + + # A limit of 0 means no limit. + self.assertSelects( + soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) + + def test_calling_a_tag_is_calling_findall(self): + soup = self.soup("123") + self.assertSelects(soup('a', limit=1), ["1"]) + self.assertSelects(soup.b(id="foo"), ["3"]) + + def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): + soup = self.soup("") + # Create a self-referential list. + l = [] + l.append(l) + + # Without special code in _normalize_search_value, this would cause infinite + # recursion. + self.assertEqual([], soup.find_all(l)) + +class TestFindAllBasicNamespaces(TreeTest): + + def test_find_by_namespaced_name(self): + soup = self.soup('4') + self.assertEqual("4", soup.find("mathml:msqrt").string) + self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) + + +class TestFindAllByName(TreeTest): + """Test ways of finding tags by tag name.""" + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup("""First tag. + Second tag. + Third Nested tag. tag.""") + + def test_find_all_by_tag_name(self): + # Find all the tags. + self.assertSelects( + self.tree.find_all('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_name_and_text(self): + self.assertSelects( + self.tree.find_all('a', text='First tag.'), ['First tag.']) + + self.assertSelects( + self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) + + self.assertSelects( + self.tree.find_all('a', text=re.compile("tag")), + ['First tag.', 'Nested tag.']) + + + def test_find_all_on_non_root_element(self): + # You can call find_all on any node, not just the root. + self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) + + def test_calling_element_invokes_find_all(self): + self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_strainer(self): + self.assertSelects( + self.tree.find_all(SoupStrainer('a')), + ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_names(self): + self.assertSelects( + self.tree.find_all(['a', 'b']), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_dict(self): + self.assertSelects( + self.tree.find_all({'a' : True, 'b' : True}), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_re(self): + self.assertSelects( + self.tree.find_all(re.compile('^[ab]$')), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_with_tags_matching_method(self): + # You can define an oracle method that determines whether + # a tag matches the search. + def id_matches_name(tag): + return tag.name == tag.get('id') + + tree = self.soup("""Match 1. + Does not match. + Match 2.""") + + self.assertSelects( + tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + + +class TestFindAllByAttribute(TreeTest): + + def test_find_all_by_attribute_name(self): + # You can pass in keyword arguments to find_all to search by + # attribute. + tree = self.soup(""" + Matching a. + + Non-matching Matching b.a. + """) + self.assertSelects(tree.find_all(id='first'), + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): + peace = u"םולש".encode("utf8") + data = u''.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) + self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) + + def test_find_all_by_attribute_dict(self): + # You can pass in a dictionary as the argument 'attrs'. This + # lets you search for attributes like 'name' (a fixed argument + # to find_all) and 'class' (a reserved word in Python.) + tree = self.soup(""" + Name match. + Class match. + Non-match. + A tag called 'name1'. + """) + + # This doesn't do what you want. + self.assertSelects(tree.find_all(name='name1'), + ["A tag called 'name1'."]) + # This does what you want. + self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), + ["Name match."]) + + # Passing class='class2' would cause a syntax error. + self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), + ["Class match."]) + + def test_find_all_by_class(self): + # Passing in a string to 'attrs' will search the CSS class. + tree = self.soup(""" + Class 1. + Class 2. + Class 1. + Class 3 and 4. + """) + self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) + self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) + self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) + + def test_find_by_class_when_multiple_classes_present(self): + tree = self.soup("Found it") + + attrs = { 'class' : re.compile("o") } + f = tree.find_all("gar", attrs=attrs) + self.assertSelects(f, ["Found it"]) + + f = tree.find_all("gar", re.compile("a")) + self.assertSelects(f, ["Found it"]) + + # Since the class is not the string "foo bar", but the two + # strings "foo" and "bar", this will not find anything. + attrs = { 'class' : re.compile("o b") } + f = tree.find_all("gar", attrs=attrs) + self.assertSelects(f, []) + + def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): + soup = self.soup("Found it") + + self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) + + def big_attribute_value(value): + return len(value) > 3 + + self.assertSelects(soup.find_all("a", big_attribute_value), []) + + def small_attribute_value(value): + return len(value) <= 3 + + self.assertSelects( + soup.find_all("a", small_attribute_value), ["Found it"]) + + def test_find_all_with_string_for_attrs_finds_multiple_classes(self): + soup = self.soup('') + a, a2 = soup.find_all("a") + self.assertEqual([a, a2], soup.find_all("a", "foo")) + self.assertEqual([a], soup.find_all("a", "bar")) + + # If you specify the attribute as a string that contains a + # space, only that specific value will be found. + self.assertEqual([a], soup.find_all("a", "foo bar")) + self.assertEqual([], soup.find_all("a", "bar foo")) + + def test_find_all_by_attribute_soupstrainer(self): + tree = self.soup(""" + Match. + Non-match.""") + + strainer = SoupStrainer(attrs={'id' : 'first'}) + self.assertSelects(tree.find_all(strainer), ['Match.']) + + def test_find_all_with_missing_atribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that do not have that attribute set. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) + + def test_find_all_with_defined_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that have that attribute set to any value. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects( + tree.find_all(id=True), ["ID present.", "ID is empty."]) + + def test_find_all_with_numeric_attribute(self): + # If you search for a number, it's treated as a string. + tree = self.soup("""Unquoted attribute. + Quoted attribute.""") + + expected = ["Unquoted attribute.", "Quoted attribute."] + self.assertSelects(tree.find_all(id=1), expected) + self.assertSelects(tree.find_all(id="1"), expected) + + def test_find_all_with_list_attribute_values(self): + # You can pass a list of attribute values instead of just one, + # and you'll get tags that match any of the values. + tree = self.soup("""1 + 2 + 3 + No ID.""") + self.assertSelects(tree.find_all(id=["1", "3", "4"]), + ["1", "3"]) + + def test_find_all_with_regular_expression_attribute_value(self): + # You can pass a regular expression as an attribute value, and + # you'll get tags whose values for that attribute match the + # regular expression. + tree = self.soup("""One a. + Two as. + Mixed as and bs. + One b. + No ID.""") + + self.assertSelects(tree.find_all(id=re.compile("^a+$")), + ["One a.", "Two as."]) + + def test_find_by_name_and_containing_string(self): + soup = self.soup("foobarfoo") + a = soup.a + + self.assertEqual([a], soup.find_all("a", text="foo")) + self.assertEqual([], soup.find_all("a", text="bar")) + self.assertEqual([], soup.find_all("a", text="bar")) + + def test_find_by_name_and_containing_string_when_string_is_buried(self): + soup = self.soup("foofoo") + self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) + + def test_find_by_attribute_and_containing_string(self): + soup = self.soup('foofoo') + a = soup.a + + self.assertEqual([a], soup.find_all(id=2, text="foo")) + self.assertEqual([], soup.find_all(id=1, text="bar")) + + + + +class TestIndex(TreeTest): + """Test Tag.index""" + def test_index(self): + tree = self.soup("""
+ Identical + Not identical + Identical + + Identical with child + Also not identical + Identical with child +
""") + div = tree.div + for i, element in enumerate(div.contents): + self.assertEqual(i, div.index(element)) + self.assertRaises(ValueError, tree.index, 1) + + +class TestParentOperations(TreeTest): + """Test navigation and searching through an element's parents.""" + + def setUp(self): + super(TestParentOperations, self).setUp() + self.tree = self.soup('''
    +
      +
        +
          + Start here +
        +
      ''') + self.start = self.tree.b + + + def test_parent(self): + self.assertEqual(self.start.parent['id'], 'bottom') + self.assertEqual(self.start.parent.parent['id'], 'middle') + self.assertEqual(self.start.parent.parent.parent['id'], 'top') + + def test_parent_of_top_tag_is_soup_object(self): + top_tag = self.tree.contents[0] + self.assertEqual(top_tag.parent, self.tree) + + def test_soup_object_has_no_parent(self): + self.assertEqual(None, self.tree.parent) + + def test_find_parents(self): + self.assertSelectsIDs( + self.start.find_parents('ul'), ['bottom', 'middle', 'top']) + self.assertSelectsIDs( + self.start.find_parents('ul', id="middle"), ['middle']) + + def test_find_parent(self): + self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') + + def test_parent_of_text_element(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.parent.name, 'b') + + def test_text_element_find_parent(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.find_parent('ul')['id'], 'bottom') + + def test_parent_generator(self): + parents = [parent['id'] for parent in self.start.parents + if parent is not None and 'id' in parent.attrs] + self.assertEqual(parents, ['bottom', 'middle', 'top']) + + +class ProximityTest(TreeTest): + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup( + 'OneTwoThree') + + +class TestNextOperations(ProximityTest): + + def setUp(self): + super(TestNextOperations, self).setUp() + self.start = self.tree.b + + def test_next(self): + self.assertEqual(self.start.next_element, "One") + self.assertEqual(self.start.next_element.next_element['id'], "2") + + def test_next_of_last_item_is_none(self): + last = self.tree.find(text="Three") + self.assertEqual(last.next_element, None) + + def test_next_of_root_is_none(self): + # The document root is outside the next/previous chain. + self.assertEqual(self.tree.next_element, None) + + def test_find_all_next(self): + self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) + self.start.find_all_next(id=3) + self.assertSelects(self.start.find_all_next(id=3), ["Three"]) + + def test_find_next(self): + self.assertEqual(self.start.find_next('b')['id'], '2') + self.assertEqual(self.start.find_next(text="Three"), "Three") + + def test_find_next_for_text_element(self): + text = self.tree.find(text="One") + self.assertEqual(text.find_next("b").string, "Two") + self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) + + def test_next_generator(self): + start = self.tree.find(text="Two") + successors = [node for node in start.next_elements] + # There are two successors: the final tag and its text contents. + tag, contents = successors + self.assertEqual(tag['id'], '3') + self.assertEqual(contents, "Three") + +class TestPreviousOperations(ProximityTest): + + def setUp(self): + super(TestPreviousOperations, self).setUp() + self.end = self.tree.find(text="Three") + + def test_previous(self): + self.assertEqual(self.end.previous_element['id'], "3") + self.assertEqual(self.end.previous_element.previous_element, "Two") + + def test_previous_of_first_item_is_none(self): + first = self.tree.find('html') + self.assertEqual(first.previous_element, None) + + def test_previous_of_root_is_none(self): + # The document root is outside the next/previous chain. + # XXX This is broken! + #self.assertEqual(self.tree.previous_element, None) + pass + + def test_find_all_previous(self): + # The tag containing the "Three" node is the predecessor + # of the "Three" node itself, which is why "Three" shows up + # here. + self.assertSelects( + self.end.find_all_previous('b'), ["Three", "Two", "One"]) + self.assertSelects(self.end.find_all_previous(id=1), ["One"]) + + def test_find_previous(self): + self.assertEqual(self.end.find_previous('b')['id'], '3') + self.assertEqual(self.end.find_previous(text="One"), "One") + + def test_find_previous_for_text_element(self): + text = self.tree.find(text="Three") + self.assertEqual(text.find_previous("b").string, "Three") + self.assertSelects( + text.find_all_previous("b"), ["Three", "Two", "One"]) + + def test_previous_generator(self): + start = self.tree.find(text="One") + predecessors = [node for node in start.previous_elements] + + # There are four predecessors: the tag containing "One" + # the tag, the tag, and the tag. + b, body, head, html = predecessors + self.assertEqual(b['id'], '1') + self.assertEqual(body.name, "body") + self.assertEqual(head.name, "head") + self.assertEqual(html.name, "html") + + +class SiblingTest(TreeTest): + + def setUp(self): + super(SiblingTest, self).setUp() + markup = ''' + + + + + + + + + + + ''' + # All that whitespace looks good but makes the tests more + # difficult. Get rid of it. + markup = re.compile("\n\s*").sub("", markup) + self.tree = self.soup(markup) + + +class TestNextSibling(SiblingTest): + + def setUp(self): + super(TestNextSibling, self).setUp() + self.start = self.tree.find(id="1") + + def test_next_sibling_of_root_is_none(self): + self.assertEqual(self.tree.next_sibling, None) + + def test_next_sibling(self): + self.assertEqual(self.start.next_sibling['id'], '2') + self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') + + # Note the difference between next_sibling and next_element. + self.assertEqual(self.start.next_element['id'], '1.1') + + def test_next_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.next_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.next_sibling, None) + + last_span = self.tree.find(id="4") + self.assertEqual(last_span.next_sibling, None) + + def test_find_next_sibling(self): + self.assertEqual(self.start.find_next_sibling('span')['id'], '2') + + def test_next_siblings(self): + self.assertSelectsIDs(self.start.find_next_siblings("span"), + ['2', '3', '4']) + + self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) + + def test_next_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="Foo") + self.assertEqual(start.next_sibling.name, 'b') + self.assertEqual(start.next_sibling.next_sibling, 'baz') + + self.assertSelects(start.find_next_siblings('b'), ['bar']) + self.assertEqual(start.find_next_sibling(text="baz"), "baz") + self.assertEqual(start.find_next_sibling(text="nonesuch"), None) + + +class TestPreviousSibling(SiblingTest): + + def setUp(self): + super(TestPreviousSibling, self).setUp() + self.end = self.tree.find(id="4") + + def test_previous_sibling_of_root_is_none(self): + self.assertEqual(self.tree.previous_sibling, None) + + def test_previous_sibling(self): + self.assertEqual(self.end.previous_sibling['id'], '3') + self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') + + # Note the difference between previous_sibling and previous_element. + self.assertEqual(self.end.previous_element['id'], '3.1') + + def test_previous_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.previous_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.previous_sibling, None) + + first_span = self.tree.find(id="1") + self.assertEqual(first_span.previous_sibling, None) + + def test_find_previous_sibling(self): + self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') + + def test_previous_siblings(self): + self.assertSelectsIDs(self.end.find_previous_siblings("span"), + ['3', '2', '1']) + + self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) + + def test_previous_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="baz") + self.assertEqual(start.previous_sibling.name, 'b') + self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') + + self.assertSelects(start.find_previous_siblings('b'), ['bar']) + self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") + self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) + + +class TestTagCreation(SoupTest): + """Test the ability to create new tags.""" + def test_new_tag(self): + soup = self.soup("") + new_tag = soup.new_tag("foo", bar="baz") + self.assertTrue(isinstance(new_tag, Tag)) + self.assertEqual("foo", new_tag.name) + self.assertEqual(dict(bar="baz"), new_tag.attrs) + self.assertEqual(None, new_tag.parent) + + def test_tag_inherits_self_closing_rules_from_builder(self): + if XML_BUILDER_PRESENT: + xml_soup = BeautifulSoup("", "xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the
      and

      tag are empty-element, just because + # they have no contents. + self.assertEqual(b"
      ", xml_br.encode()) + self.assertEqual(b"

      ", xml_p.encode()) + + html_soup = BeautifulSoup("", "html") + html_br = html_soup.new_tag("br") + html_p = html_soup.new_tag("p") + + # The HTML builder users HTML's rules about which tags are + # empty-element tags, and the new tags reflect these rules. + self.assertEqual(b"
      ", html_br.encode()) + self.assertEqual(b"

      ", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) + +class TestTreeModification(SoupTest): + + def test_attribute_modification(self): + soup = self.soup('') + soup.a['id'] = 2 + self.assertEqual(soup.decode(), self.document_for('')) + del(soup.a['id']) + self.assertEqual(soup.decode(), self.document_for('')) + soup.a['id2'] = 'foo' + self.assertEqual(soup.decode(), self.document_for('')) + + def test_new_tag_creation(self): + builder = builder_registry.lookup('html')() + soup = self.soup("", builder=builder) + a = Tag(soup, builder, 'a') + ol = Tag(soup, builder, 'ol') + a['href'] = 'http://foo.com/' + soup.body.insert(0, a) + soup.body.insert(1, ol) + self.assertEqual( + soup.body.encode(), + b'
        ') + + def test_append_to_contents_moves_tag(self): + doc = """

        Don't leave me here.

        +

        Don\'t leave!

        """ + soup = self.soup(doc) + second_para = soup.find(id='2') + bold = soup.b + + # Move the tag to the end of the second paragraph. + soup.find(id='2').append(soup.b) + + # The tag is now a child of the second paragraph. + self.assertEqual(bold.parent, second_para) + + self.assertEqual( + soup.decode(), self.document_for( + '

        Don\'t leave me .

        \n' + '

        Don\'t leave!here

        ')) + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.replace_with(soup.c) + self.assertEqual(a, new_a) + + def test_unwrap_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.unwrap() + self.assertEqual(a, new_a) + + def test_replace_tag_with_itself(self): + text = "Foo" + soup = self.soup(text) + c = soup.c + soup.c.replace_with(c) + self.assertEqual(soup.decode(), self.document_for(text)) + + def test_replace_tag_with_its_parent_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.b.replace_with, soup.a) + + def test_insert_tag_into_itself_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.a.insert, 0, soup.a) + + def test_replace_with_maintains_next_element_throughout(self): + soup = self.soup('

        onethree

        ') + a = soup.a + b = a.contents[0] + # Make it so the tag has two text children. + a.insert(1, "two") + + # Now replace each one with the empty string. + left, right = a.contents + left.replaceWith('') + right.replaceWith('') + + # The tag is still connected to the tree. + self.assertEqual("three", soup.b.string) + + def test_replace_final_node(self): + soup = self.soup("Argh!") + soup.find(text="Argh!").replace_with("Hooray!") + new_text = soup.find(text="Hooray!") + b = soup.b + self.assertEqual(new_text.previous_element, b) + self.assertEqual(new_text.parent, b) + self.assertEqual(new_text.previous_element.next_element, new_text) + self.assertEqual(new_text.next_element, None) + + def test_consecutive_text_nodes(self): + # A builder should never create two consecutive text nodes, + # but if you insert one next to another, Beautiful Soup will + # handle it correctly. + soup = self.soup("Argh!") + soup.b.insert(1, "Hooray!") + + self.assertEqual( + soup.decode(), self.document_for( + "Argh!Hooray!")) + + new_text = soup.find(text="Hooray!") + self.assertEqual(new_text.previous_element, "Argh!") + self.assertEqual(new_text.previous_element.next_element, new_text) + + self.assertEqual(new_text.previous_sibling, "Argh!") + self.assertEqual(new_text.previous_sibling.next_sibling, new_text) + + self.assertEqual(new_text.next_sibling, None) + self.assertEqual(new_text.next_element, soup.c) + + def test_insert_string(self): + soup = self.soup("") + soup.a.insert(0, "bar") + soup.a.insert(0, "foo") + # The string were added to the tag. + self.assertEqual(["foo", "bar"], soup.a.contents) + # And they were converted to NavigableStrings. + self.assertEqual(soup.a.contents[0].next_element, "bar") + + def test_insert_tag(self): + builder = self.default_builder + soup = self.soup( + "Findlady!", builder=builder) + magic_tag = Tag(soup, builder, 'magictag') + magic_tag.insert(0, "the") + soup.a.insert(1, magic_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "Findthelady!")) + + # Make sure all the relationships are hooked up correctly. + b_tag = soup.b + self.assertEqual(b_tag.next_sibling, magic_tag) + self.assertEqual(magic_tag.previous_sibling, b_tag) + + find = b_tag.find(text="Find") + self.assertEqual(find.next_element, magic_tag) + self.assertEqual(magic_tag.previous_element, find) + + c_tag = soup.c + self.assertEqual(magic_tag.next_sibling, c_tag) + self.assertEqual(c_tag.previous_sibling, magic_tag) + + the = magic_tag.find(text="the") + self.assertEqual(the.parent, magic_tag) + self.assertEqual(the.next_element, c_tag) + self.assertEqual(c_tag.previous_element, the) + + def test_append_child_thats_already_at_the_end(self): + data = "" + soup = self.soup(data) + soup.a.append(soup.b) + self.assertEqual(data, soup.decode()) + + def test_move_tag_to_beginning_of_parent(self): + data = "" + soup = self.soup(data) + soup.a.insert(0, soup.d) + self.assertEqual("", soup.decode()) + + def test_insert_works_on_empty_element_tag(self): + # This is a little strange, since most HTML parsers don't allow + # markup like this to come through. But in general, we don't + # know what the parser would or wouldn't have allowed, so + # I'm letting this succeed for now. + soup = self.soup("
        ") + soup.br.insert(1, "Contents") + self.assertEqual(str(soup.br), "
        Contents
        ") + + def test_insert_before(self): + soup = self.soup("foobar") + soup.b.insert_before("BAZ") + soup.a.insert_before("QUUX") + self.assertEqual( + soup.decode(), self.document_for("QUUXfooBAZbar")) + + soup.a.insert_before(soup.b) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after(self): + soup = self.soup("foobar") + soup.b.insert_after("BAZ") + soup.a.insert_after("QUUX") + self.assertEqual( + soup.decode(), self.document_for("fooQUUXbarBAZ")) + soup.b.insert_after(soup.a) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after_raises_valueerror_if_after_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_after, tag) + self.assertRaises(ValueError, soup.insert_after, tag) + self.assertRaises(ValueError, tag.insert_after, tag) + + def test_insert_before_raises_valueerror_if_before_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_before, tag) + self.assertRaises(ValueError, soup.insert_before, tag) + self.assertRaises(ValueError, tag.insert_before, tag) + + def test_replace_with(self): + soup = self.soup( + "

        There's no business like show business

        ") + no, show = soup.find_all('b') + show.replace_with(no) + self.assertEqual( + soup.decode(), + self.document_for( + "

        There's business like no business

        ")) + + self.assertEqual(show.parent, None) + self.assertEqual(no.parent, soup.p) + self.assertEqual(no.next_element, "no") + self.assertEqual(no.next_sibling, " business") + + def test_replace_first_child(self): + data = "" + soup = self.soup(data) + soup.b.replace_with(soup.c) + self.assertEqual("", soup.decode()) + + def test_replace_last_child(self): + data = "" + soup = self.soup(data) + soup.c.replace_with(soup.b) + self.assertEqual("", soup.decode()) + + def test_nested_tag_replace_with(self): + soup = self.soup( + """Wereservetherighttorefuseservice""") + + # Replace the entire tag and its contents ("reserve the + # right") with the tag ("refuse"). + remove_tag = soup.b + move_tag = soup.f + remove_tag.replace_with(move_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "Werefusetoservice")) + + # The tag is now an orphan. + self.assertEqual(remove_tag.parent, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) + self.assertEqual(remove_tag.next_sibling, None) + self.assertEqual(remove_tag.previous_sibling, None) + + # The tag is now connected to the tag. + self.assertEqual(move_tag.parent, soup.a) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) + self.assertEqual(move_tag.next_sibling, None) + + # The gap where the tag used to be has been mended, and + # the word "to" is now connected to the tag. + to_text = soup.find(text="to") + g_tag = soup.g + self.assertEqual(to_text.next_element, g_tag) + self.assertEqual(to_text.next_sibling, g_tag) + self.assertEqual(g_tag.previous_element, to_text) + self.assertEqual(g_tag.previous_sibling, to_text) + + def test_unwrap(self): + tree = self.soup(""" +

        Unneeded formatting is unneeded

        + """) + tree.em.unwrap() + self.assertEqual(tree.em, None) + self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + self.assertEqual(value.decode(), "I wish I was bold.") + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("I like being bold.I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual(2, len(soup.b.contents)) + self.assertEqual( + soup.decode(), self.document_for( + "I like being bold.I wish I was bold.")) + + def test_extract(self): + soup = self.soup( + 'Some content. More content.') + + self.assertEqual(len(soup.body.contents), 3) + extracted = soup.find(id="nav").extract() + + self.assertEqual( + soup.decode(), "Some content. More content.") + self.assertEqual(extracted.decode(), '') + + # The extracted tag is now an orphan. + self.assertEqual(len(soup.body.contents), 2) + self.assertEqual(extracted.parent, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(text="Some content. ") + content_2 = soup.find(text=" More content.") + self.assertEqual(content_1.next_element, content_2) + self.assertEqual(content_1.next_sibling, content_2) + self.assertEqual(content_2.previous_element, content_1) + self.assertEqual(content_2.previous_sibling, content_1) + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("
        foobar") + foo_1 = soup.a.string + bar_1 = soup.b.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the tag, and two + # in the tag. Let's remove the first "foo" and the second + # "bar". + foo_1.extract() + bar_2.extract() + self.assertEqual(foo_2, soup.a.string) + self.assertEqual(bar_2, soup.b.string) + + def test_clear(self): + """Tag.clear()""" + soup = self.soup("

        String Italicized and another

        ") + # clear using extract() + a = soup.a + soup.p.clear() + self.assertEqual(len(soup.p.contents), 0) + self.assertTrue(hasattr(a, "contents")) + + # clear using decompose() + em = a.em + a.clear(decompose=True) + self.assertFalse(hasattr(em, "contents")) + + def test_string_set(self): + """Tag.string = 'string'""" + soup = self.soup(" ") + soup.a.string = "foo" + self.assertEqual(soup.a.contents, ["foo"]) + soup.b.string = "bar" + self.assertEqual(soup.b.contents, ["bar"]) + + def test_string_set_does_not_affect_original_string(self): + soup = self.soup("foobar") + soup.b.string = soup.c.string + self.assertEqual(soup.a.encode(), b"barbar") + + def test_set_string_preserves_class_of_string(self): + soup = self.soup("") + cdata = CData("foo") + soup.a.string = cdata + self.assertTrue(isinstance(soup.a.string, CData)) + +class TestElementObjects(SoupTest): + """Test various features of element objects.""" + + def test_len(self): + """The length of an element is its number of children.""" + soup = self.soup("123") + + # The BeautifulSoup object itself contains one element: the + # tag. + self.assertEqual(len(soup.contents), 1) + self.assertEqual(len(soup), 1) + + # The tag contains three elements: the text node "1", the + # tag, and the text node "3". + self.assertEqual(len(soup.top), 3) + self.assertEqual(len(soup.top.contents), 3) + + def test_member_access_invokes_find(self): + """Accessing a Python member .foo invokes find('foo')""" + soup = self.soup('') + self.assertEqual(soup.b, soup.find('b')) + self.assertEqual(soup.b.i, soup.find('b').find('i')) + self.assertEqual(soup.a, None) + + def test_deprecated_member_access(self): + soup = self.soup('') + with warnings.catch_warnings(record=True) as w: + tag = soup.bTag + self.assertEqual(soup.b, tag) + self.assertEqual( + '.bTag is deprecated, use .find("b") instead.', + str(w[0].message)) + + def test_has_attr(self): + """has_attr() checks for the presence of an attribute. + + Please note note: has_attr() is different from + __in__. has_attr() checks the tag's attributes and __in__ + checks the tag's chidlren. + """ + soup = self.soup("") + self.assertTrue(soup.foo.has_attr('attr')) + self.assertFalse(soup.foo.has_attr('attr2')) + + + def test_attributes_come_out_in_alphabetical_order(self): + markup = '' + self.assertSoupEquals(markup, '') + + def test_string(self): + # A tag that contains only a text node makes that node + # available as .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, 'foo') + + def test_empty_tag_has_no_string(self): + # A tag with no children has no .stirng. + soup = self.soup("") + self.assertEqual(soup.b.string, None) + + def test_tag_with_multiple_children_has_no_string(self): + # A tag with no children has no .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, None) + + soup = self.soup("foobar
        ") + self.assertEqual(soup.b.string, None) + + # Even if all the children are strings, due to trickery, + # it won't work--but this would be a good optimization. + soup = self.soup("foo
        ") + soup.a.insert(1, "bar") + self.assertEqual(soup.a.string, None) + + def test_tag_with_recursive_string_has_string(self): + # A tag with a single child which has a .string inherits that + # .string. + soup = self.soup("foo") + self.assertEqual(soup.a.string, "foo") + self.assertEqual(soup.string, "foo") + + def test_lack_of_string(self): + """Only a tag containing a single text node has a .string.""" + soup = self.soup("feo") + self.assertFalse(soup.b.string) + + soup = self.soup("") + self.assertFalse(soup.b.string) + + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("ar t ") + self.assertEqual(soup.a.text, "ar t ") + self.assertEqual(soup.a.get_text(strip=True), "art") + self.assertEqual(soup.a.get_text(","), "a,r, , t ") + self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") + +class TestCDAtaListAttributes(SoupTest): + + """Testing cdata-list attributes like 'class'. + """ + def test_single_value_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo"],soup.a['class']) + + def test_multiple_values_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo", "bar"], soup.a['class']) + + def test_multiple_values_separated_by_weird_whitespace(self): + soup = self.soup("") + self.assertEqual(["foo", "bar", "baz"],soup.a['class']) + + def test_attributes_joined_into_string_on_output(self): + soup = self.soup("") + self.assertEqual(b'', soup.a.encode()) + + def test_accept_charset(self): + soup = self.soup('
        ') + self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) + + def test_cdata_attribute_applying_only_to_one_tag(self): + data = '' + soup = self.soup(data) + # We saw in another test that accept-charset is a cdata-list + # attribute for the tag. But it's not a cdata-list + # attribute for any other tag. + self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) + + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." + + def setUp(self): + super(TestPersistence, self).setUp() + self.page = """ + + + +Beautiful Soup: We called him Tortoise because he taught us. + + + + + + +foo +bar + +""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), self.tree.decode()) + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + self.assertEqual(copied.decode(), self.tree.decode()) + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = u"\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + +class TestSubstitutions(SoupTest): + + def test_default_formatter_is_minimal(self): + markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( + u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_html(self): + markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + self.assertEqual( + decoded, + self.document_for("<<Sacré bleu!>>")) + + def test_formatter_minimal(self): + markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( + u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_null(self): + markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + self.assertEqual(decoded, + self.document_for(u"<>")) + + def test_formatter_custom(self): + markup = u"<foo>bar" + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + self.assertEqual( + decoded, + self.document_for(u"BAR")) + + def test_formatter_is_run_on_attribute_values(self): + markup = u'e' + soup = self.soup(markup) + a = soup.a + + expect_minimal = u'e' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + + expect_html = u'e' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) + expect_upper = u'E' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + + def test_prettify_accepts_formatter(self): + soup = BeautifulSoup("foo") + pretty = soup.prettify(formatter = lambda x: x.upper()) + self.assertTrue("FOO" in pretty) + + def test_prettify_outputs_unicode_by_default(self): + soup = self.soup("") + self.assertEqual(unicode, type(soup.prettify())) + + def test_prettify_can_encode_data(self): + soup = self.soup("") + self.assertEqual(bytes, type(soup.prettify("utf-8"))) + + def test_html_entity_substitution_off_by_default(self): + markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" + soup = self.soup(markup) + encoded = soup.b.encode("utf-8") + self.assertEqual(encoded, markup.encode('utf-8')) + + def test_encoding_substitution(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + soup = self.soup(meta_tag) + + # Parse the document, and the charset apprears unchanged. + self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') + + # Encode the document into some encoding, and the encoding is + # substituted into the meta tag. + utf_8 = soup.encode("utf-8") + self.assertTrue(b"charset=utf-8" in utf_8) + + euc_jp = soup.encode("euc_jp") + self.assertTrue(b"charset=euc_jp" in euc_jp) + + shift_jis = soup.encode("shift-jis") + self.assertTrue(b"charset=shift-jis" in shift_jis) + + utf_16_u = soup.encode("utf-16").decode("utf-16") + self.assertTrue("charset=utf-16" in utf_16_u) + + def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): + markup = ('
        foo
        ') + + # Beautiful Soup used to try to rewrite the meta tag even if the + # meta tag got filtered out by the strainer. This test makes + # sure that doesn't happen. + strainer = SoupStrainer('pre') + soup = self.soup(markup, parse_only=strainer) + self.assertEqual(soup.contents[0].name, 'pre') + +class TestEncoding(SoupTest): + """Test the ability to encode objects into strings.""" + + def test_unicode_string_can_be_encoded(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual(soup.b.string.encode("utf-8"), + u"\N{SNOWMAN}".encode("utf-8")) + + def test_tag_containing_unicode_string_can_be_encoded(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( + soup.b.encode("utf-8"), html.encode("utf-8")) + + def test_encoding_substitutes_unrecognized_characters_by_default(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual(soup.b.encode("ascii"), b"") + + def test_encoding_can_be_made_strict(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertRaises( + UnicodeEncodeError, soup.encode, "ascii", errors="strict") + + def test_decode_contents(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) + + def test_encode_contents(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( + u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( + encoding="utf8")) + + def test_deprecated_renderContents(self): + html = u"\N{SNOWMAN}" + soup = self.soup(html) + self.assertEqual( + u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) + +class TestNavigableStringSubclasses(SoupTest): + + def test_cdata(self): + # None of the current builders turn CDATA sections into CData + # objects, but you can create them manually. + soup = self.soup("") + cdata = CData("foo") + soup.insert(1, cdata) + self.assertEqual(str(soup), "") + self.assertEqual(soup.find(text="foo"), "foo") + self.assertEqual(soup.contents[0], "foo") + + def test_cdata_is_never_formatted(self): + """Text inside a CData object is passed into the formatter. + + But the return value is ignored. + """ + + self.count = 0 + def increment(*args): + self.count += 1 + return "BITTER FAILURE" + + soup = self.soup("") + cdata = CData("<><><>") + soup.insert(1, cdata) + self.assertEqual( + b"<><>]]>", soup.encode(formatter=increment)) + self.assertEqual(1, self.count) + + def test_doctype_ends_in_newline(self): + # Unlike other NavigableString subclasses, a DOCTYPE always ends + # in a newline. + doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"\n") + + +class TestSoupSelector(TreeTest): + + HTML = """ + + + +The title + + + + +
        +
        +

        An H1

        +

        Some text

        +

        Some more text

        +

        An H2

        +

        Another

        +Bob +

        Another H2

        +me + +span1a1 +span1a2 test + +span2a1 + + + +
        +

        English

        +

        English UK

        +

        English US

        +

        French

        +
        + + +""" + + def setUp(self): + self.soup = BeautifulSoup(self.HTML) + + def assertSelects(self, selector, expected_ids): + el_ids = [el['id'] for el in self.soup.select(selector)] + el_ids.sort() + expected_ids.sort() + self.assertEqual(expected_ids, el_ids, + "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + ) + + assertSelect = assertSelects + + def assertSelectMultiple(self, *tests): + for selector, expected_ids in tests: + self.assertSelect(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') + self.assertEqual(els[0].contents, [u'The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') + self.assertEqual(len(els), 3) + for div in els: + self.assertEqual(div.name, 'div') + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assertSelects('div div', ['inner']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assertSelects(selector, ['main', 'inner', 'footer']) + + def test_tag_no_match(self): + self.assertEqual(len(self.soup.select('del')), 0) + + def test_invalid_tag(self): + self.assertEqual(len(self.soup.select('tag%t')), 0) + + def test_header_tags(self): + self.assertSelectMultiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'p') + self.assertEqual(els[0]['class'], ['onep']) + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + self.assertEqual(len(els), 0) + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assertSelects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + self.assertEqual(len(els), 0) + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + self.assertEqual(len(els), 3) + for el in els: + self.assertEqual(el.name, 'p') + self.assertEqual(els[1]['class'], ['onep']) + self.assertFalse(els[0].has_key('class')) + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + self.assertEqual(len(self.soup.select(selector)), 0) + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assertSelects(selector, ['pmulti']) + + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assertSelects(selector, ['pmulti']) + + def test_child_selector(self): + self.assertSelects('.s1 > a', ['s1a1', 's1a2']) + self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_attribute_equals(self): + self.assertSelectMultiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + ('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assertSelectMultiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assertSelectMultiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', ['me', 'main']), + ('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ) + + def test_attribute_endswith(self): + self.assertSelectMultiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), + ('div[id$="1"]', []), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assertSelectMultiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('a[href*="http://"]', ['bob', 'me']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), + ('div[id*="1"]', []), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ) + + def test_attribute_exact_or_hypen(self): + self.assertSelectMultiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assertSelectMultiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. + inner = self.soup.find("div", id="main") + selected = inner.select("div") + # The
        tag was selected. The diff --git a/headphones/versioncheck.py b/headphones/versioncheck.py index 429ec8b9..e1440fd8 100644 --- a/headphones/versioncheck.py +++ b/headphones/versioncheck.py @@ -20,7 +20,7 @@ from headphones import logger, version import lib.simplejson as simplejson -user = "rembo10" +user = "AdeHub" branch = "master" def runGit(args): diff --git a/html5lib/__init__.py b/html5lib/__init__.py new file mode 100644 index 00000000..16537aad --- /dev/null +++ b/html5lib/__init__.py @@ -0,0 +1,17 @@ +""" +HTML parsing library based on the WHATWG "HTML5" +specification. The parser is designed to be compatible with existing +HTML found in the wild and implements well-defined error recovery that +is largely compatible with modern desktop web browsers. + +Example usage: + +import html5lib +f = open("my_document.html") +tree = html5lib.parse(f) +""" +__version__ = "0.95-dev" +from html5parser import HTMLParser, parse, parseFragment +from treebuilders import getTreeBuilder +from treewalkers import getTreeWalker +from serializer import serialize diff --git a/html5lib/constants.py b/html5lib/constants.py new file mode 100644 index 00000000..b533018e --- /dev/null +++ b/html5lib/constants.py @@ -0,0 +1,3085 @@ +import string, gettext +_ = gettext.gettext + +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset + +EOF = None + +E = { + "null-character": + _(u"Null character in input stream, replaced with U+FFFD."), + "invalid-codepoint": + _(u"Invalid codepoint in stream."), + "incorrectly-placed-solidus": + _(u"Solidus (/) incorrectly placed in tag."), + "incorrect-cr-newline-entity": + _(u"Incorrect CR newline entity, replaced with LF."), + "illegal-windows-1252-entity": + _(u"Entity used with illegal number (windows-1252 reference)."), + "cant-convert-numeric-entity": + _(u"Numeric entity couldn't be converted to character " + u"(codepoint U+%(charAsInt)08x)."), + "illegal-codepoint-for-numeric-entity": + _(u"Numeric entity represents an illegal codepoint: " + u"U+%(charAsInt)08x."), + "numeric-entity-without-semicolon": + _(u"Numeric entity didn't end with ';'."), + "expected-numeric-entity-but-got-eof": + _(u"Numeric entity expected. Got end of file instead."), + "expected-numeric-entity": + _(u"Numeric entity expected but none found."), + "named-entity-without-semicolon": + _(u"Named entity didn't end with ';'."), + "expected-named-entity": + _(u"Named entity expected. Got none."), + "attributes-in-end-tag": + _(u"End tag contains unexpected attributes."), + 'self-closing-flag-on-end-tag': + _(u"End tag contains unexpected self-closing flag."), + "expected-tag-name-but-got-right-bracket": + _(u"Expected tag name. Got '>' instead."), + "expected-tag-name-but-got-question-mark": + _(u"Expected tag name. Got '?' instead. (HTML doesn't " + u"support processing instructions.)"), + "expected-tag-name": + _(u"Expected tag name. Got something else instead"), + "expected-closing-tag-but-got-right-bracket": + _(u"Expected closing tag. Got '>' instead. Ignoring ''."), + "expected-closing-tag-but-got-eof": + _(u"Expected closing tag. Unexpected end of file."), + "expected-closing-tag-but-got-char": + _(u"Expected closing tag. Unexpected character '%(data)s' found."), + "eof-in-tag-name": + _(u"Unexpected end of file in the tag name."), + "expected-attribute-name-but-got-eof": + _(u"Unexpected end of file. Expected attribute name instead."), + "eof-in-attribute-name": + _(u"Unexpected end of file in attribute name."), + "invalid-character-in-attribute-name": + _(u"Invalid chracter in attribute name"), + "duplicate-attribute": + _(u"Dropped duplicate attribute on tag."), + "expected-end-of-tag-name-but-got-eof": + _(u"Unexpected end of file. Expected = or end of tag."), + "expected-attribute-value-but-got-eof": + _(u"Unexpected end of file. Expected attribute value."), + "expected-attribute-value-but-got-right-bracket": + _(u"Expected attribute value. Got '>' instead."), + 'equals-in-unquoted-attribute-value': + _(u"Unexpected = in unquoted attribute"), + 'unexpected-character-in-unquoted-attribute-value': + _(u"Unexpected character in unquoted attribute"), + "invalid-character-after-attribute-name": + _(u"Unexpected character after attribute name."), + "unexpected-character-after-attribute-value": + _(u"Unexpected character after attribute value."), + "eof-in-attribute-value-double-quote": + _(u"Unexpected end of file in attribute value (\")."), + "eof-in-attribute-value-single-quote": + _(u"Unexpected end of file in attribute value (')."), + "eof-in-attribute-value-no-quotes": + _(u"Unexpected end of file in attribute value."), + "unexpected-EOF-after-solidus-in-tag": + _(u"Unexpected end of file in tag. Expected >"), + "unexpected-character-after-soldius-in-tag": + _(u"Unexpected character after / in tag. Expected >"), + "expected-dashes-or-doctype": + _(u"Expected '--' or 'DOCTYPE'. Not found."), + "unexpected-bang-after-double-dash-in-comment": + _(u"Unexpected ! after -- in comment"), + "unexpected-space-after-double-dash-in-comment": + _(u"Unexpected space after -- in comment"), + "incorrect-comment": + _(u"Incorrect comment."), + "eof-in-comment": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-dash": + _(u"Unexpected end of file in comment (-)"), + "unexpected-dash-after-double-dash-in-comment": + _(u"Unexpected '-' after '--' found in comment."), + "eof-in-comment-double-dash": + _(u"Unexpected end of file in comment (--)."), + "eof-in-comment-end-space-state": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-bang-state": + _(u"Unexpected end of file in comment."), + "unexpected-char-in-comment": + _(u"Unexpected character in comment found."), + "need-space-after-doctype": + _(u"No space after literal string 'DOCTYPE'."), + "expected-doctype-name-but-got-right-bracket": + _(u"Unexpected > character. Expected DOCTYPE name."), + "expected-doctype-name-but-got-eof": + _(u"Unexpected end of file. Expected DOCTYPE name."), + "eof-in-doctype-name": + _(u"Unexpected end of file in DOCTYPE name."), + "eof-in-doctype": + _(u"Unexpected end of file in DOCTYPE."), + "expected-space-or-right-bracket-in-doctype": + _(u"Expected space or '>'. Got '%(data)s'"), + "unexpected-end-of-doctype": + _(u"Unexpected end of DOCTYPE."), + "unexpected-char-in-doctype": + _(u"Unexpected character in DOCTYPE."), + "eof-in-innerhtml": + _(u"XXX innerHTML EOF"), + "unexpected-doctype": + _(u"Unexpected DOCTYPE. Ignored."), + "non-html-root": + _(u"html needs to be the first start tag."), + "expected-doctype-but-got-eof": + _(u"Unexpected End of file. Expected DOCTYPE."), + "unknown-doctype": + _(u"Erroneous DOCTYPE."), + "expected-doctype-but-got-chars": + _(u"Unexpected non-space characters. Expected DOCTYPE."), + "expected-doctype-but-got-start-tag": + _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."), + "expected-doctype-but-got-end-tag": + _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."), + "end-tag-after-implied-root": + _(u"Unexpected end tag (%(name)s) after the (implied) root element."), + "expected-named-closing-tag-but-got-eof": + _(u"Unexpected end of file. Expected end tag (%(name)s)."), + "two-heads-are-not-better-than-one": + _(u"Unexpected start tag head in existing head. Ignored."), + "unexpected-end-tag": + _(u"Unexpected end tag (%(name)s). Ignored."), + "unexpected-start-tag-out-of-my-head": + _(u"Unexpected start tag (%(name)s) that can be in head. Moved."), + "unexpected-start-tag": + _(u"Unexpected start tag (%(name)s)."), + "missing-end-tag": + _(u"Missing end tag (%(name)s)."), + "missing-end-tags": + _(u"Missing end tags (%(name)s)."), + "unexpected-start-tag-implies-end-tag": + _(u"Unexpected start tag (%(startName)s) " + u"implies end tag (%(endName)s)."), + "unexpected-start-tag-treated-as": + _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "deprecated-tag": + _(u"Unexpected start tag %(name)s. Don't use it!"), + "unexpected-start-tag-ignored": + _(u"Unexpected start tag %(name)s. Ignored."), + "expected-one-end-tag-but-got-another": + _(u"Unexpected end tag (%(gotName)s). " + u"Missing end tag (%(expectedName)s)."), + "end-tag-too-early": + _(u"End tag (%(name)s) seen too early. Expected other end tag."), + "end-tag-too-early-named": + _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "end-tag-too-early-ignored": + _(u"End tag (%(name)s) seen too early. Ignored."), + "adoption-agency-1.1": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 1 of the adoption agency algorithm."), + "adoption-agency-1.2": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 2 of the adoption agency algorithm."), + "adoption-agency-1.3": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 3 of the adoption agency algorithm."), + "unexpected-end-tag-treated-as": + _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + "no-end-tag": + _(u"This element (%(name)s) has no end tag."), + "unexpected-implied-end-tag-in-table": + _(u"Unexpected implied end tag (%(name)s) in the table phase."), + "unexpected-implied-end-tag-in-table-body": + _(u"Unexpected implied end tag (%(name)s) in the table body phase."), + "unexpected-char-implies-table-voodoo": + _(u"Unexpected non-space characters in " + u"table context caused voodoo mode."), + "unexpected-hidden-input-in-table": + _(u"Unexpected input with type hidden in table context."), + "unexpected-form-in-table": + _(u"Unexpected form in table context."), + "unexpected-start-tag-implies-table-voodoo": + _(u"Unexpected start tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-end-tag-implies-table-voodoo": + _(u"Unexpected end tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-cell-in-table-body": + _(u"Unexpected table cell start tag (%(name)s) " + u"in the table body phase."), + "unexpected-cell-end-tag": + _(u"Got table cell end tag (%(name)s) " + u"while required end tags are missing."), + "unexpected-end-tag-in-table-body": + _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "unexpected-implied-end-tag-in-table-row": + _(u"Unexpected implied end tag (%(name)s) in the table row phase."), + "unexpected-end-tag-in-table-row": + _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "unexpected-select-in-select": + _(u"Unexpected select start tag in the select phase " + u"treated as select end tag."), + "unexpected-input-in-select": + _(u"Unexpected input start tag in the select phase."), + "unexpected-start-tag-in-select": + _(u"Unexpected start tag token (%(name)s in the select phase. " + u"Ignored."), + "unexpected-end-tag-in-select": + _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."), + "unexpected-table-element-start-tag-in-select-in-table": + _(u"Unexpected table element start tag (%(name)s) in the select in table phase."), + "unexpected-table-element-end-tag-in-select-in-table": + _(u"Unexpected table element end tag (%(name)s) in the select in table phase."), + "unexpected-char-after-body": + _(u"Unexpected non-space characters in the after body phase."), + "unexpected-start-tag-after-body": + _(u"Unexpected start tag token (%(name)s)" + u" in the after body phase."), + "unexpected-end-tag-after-body": + _(u"Unexpected end tag token (%(name)s)" + u" in the after body phase."), + "unexpected-char-in-frameset": + _(u"Unepxected characters in the frameset phase. Characters ignored."), + "unexpected-start-tag-in-frameset": + _(u"Unexpected start tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-frameset-in-frameset-innerhtml": + _(u"Unexpected end tag token (frameset) " + u"in the frameset phase (innerHTML)."), + "unexpected-end-tag-in-frameset": + _(u"Unexpected end tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-char-after-frameset": + _(u"Unexpected non-space characters in the " + u"after frameset phase. Ignored."), + "unexpected-start-tag-after-frameset": + _(u"Unexpected start tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-frameset": + _(u"Unexpected end tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-body-innerhtml": + _(u"Unexpected end tag after body(innerHtml)"), + "expected-eof-but-got-char": + _(u"Unexpected non-space characters. Expected end of file."), + "expected-eof-but-got-start-tag": + _(u"Unexpected start tag (%(name)s)" + u". Expected end of file."), + "expected-eof-but-got-end-tag": + _(u"Unexpected end tag (%(name)s)" + u". Expected end of file."), + "eof-in-table": + _(u"Unexpected end of file. Expected table content."), + "eof-in-select": + _(u"Unexpected end of file. Expected select content."), + "eof-in-frameset": + _(u"Unexpected end of file. Expected frameset content."), + "eof-in-script-in-script": + _(u"Unexpected end of file. Expected script content."), + "eof-in-foreign-lands": + _(u"Unexpected end of file. Expected foreign content"), + "non-void-element-with-trailing-solidus": + _(u"Trailing solidus not allowed on element %(name)s"), + "unexpected-html-element-in-foreign-content": + _(u"Element %(name)s not allowed in a non-html context"), + "unexpected-end-tag-before-html": + _(u"Unexpected end tag (%(name)s) before html."), + "XXX-undefined-error": + (u"Undefined error (this sucks and should be fixed)"), +} + +namespaces = { + "html":"http://www.w3.org/1999/xhtml", + "mathml":"http://www.w3.org/1998/Math/MathML", + "svg":"http://www.w3.org/2000/svg", + "xlink":"http://www.w3.org/1999/xlink", + "xml":"http://www.w3.org/XML/1998/namespace", + "xmlns":"http://www.w3.org/2000/xmlns/" +} + +scopingElements = frozenset(( + (namespaces["html"], "applet"), + (namespaces["html"], "caption"), + (namespaces["html"], "html"), + (namespaces["html"], "marquee"), + (namespaces["html"], "object"), + (namespaces["html"], "table"), + (namespaces["html"], "td"), + (namespaces["html"], "th"), + (namespaces["mathml"], "mi"), + (namespaces["mathml"], "mo"), + (namespaces["mathml"], "mn"), + (namespaces["mathml"], "ms"), + (namespaces["mathml"], "mtext"), + (namespaces["mathml"], "annotation-xml"), + (namespaces["svg"], "foreignObject"), + (namespaces["svg"], "desc"), + (namespaces["svg"], "title"), +)) + +formattingElements = frozenset(( + (namespaces["html"], "a"), + (namespaces["html"], "b"), + (namespaces["html"], "big"), + (namespaces["html"], "code"), + (namespaces["html"], "em"), + (namespaces["html"], "font"), + (namespaces["html"], "i"), + (namespaces["html"], "nobr"), + (namespaces["html"], "s"), + (namespaces["html"], "small"), + (namespaces["html"], "strike"), + (namespaces["html"], "strong"), + (namespaces["html"], "tt"), + (namespaces["html"], "u") +)) + +specialElements = frozenset(( + (namespaces["html"], "address"), + (namespaces["html"], "applet"), + (namespaces["html"], "area"), + (namespaces["html"], "article"), + (namespaces["html"], "aside"), + (namespaces["html"], "base"), + (namespaces["html"], "basefont"), + (namespaces["html"], "bgsound"), + (namespaces["html"], "blockquote"), + (namespaces["html"], "body"), + (namespaces["html"], "br"), + (namespaces["html"], "button"), + (namespaces["html"], "caption"), + (namespaces["html"], "center"), + (namespaces["html"], "col"), + (namespaces["html"], "colgroup"), + (namespaces["html"], "command"), + (namespaces["html"], "dd"), + (namespaces["html"], "details"), + (namespaces["html"], "dir"), + (namespaces["html"], "div"), + (namespaces["html"], "dl"), + (namespaces["html"], "dt"), + (namespaces["html"], "embed"), + (namespaces["html"], "fieldset"), + (namespaces["html"], "figure"), + (namespaces["html"], "footer"), + (namespaces["html"], "form"), + (namespaces["html"], "frame"), + (namespaces["html"], "frameset"), + (namespaces["html"], "h1"), + (namespaces["html"], "h2"), + (namespaces["html"], "h3"), + (namespaces["html"], "h4"), + (namespaces["html"], "h5"), + (namespaces["html"], "h6"), + (namespaces["html"], "head"), + (namespaces["html"], "header"), + (namespaces["html"], "hr"), + (namespaces["html"], "html"), + (namespaces["html"], "iframe"), + # Note that image is commented out in the spec as "this isn't an + # element that can end up on the stack, so it doesn't matter," + (namespaces["html"], "image"), + (namespaces["html"], "img"), + (namespaces["html"], "input"), + (namespaces["html"], "isindex"), + (namespaces["html"], "li"), + (namespaces["html"], "link"), + (namespaces["html"], "listing"), + (namespaces["html"], "marquee"), + (namespaces["html"], "menu"), + (namespaces["html"], "meta"), + (namespaces["html"], "nav"), + (namespaces["html"], "noembed"), + (namespaces["html"], "noframes"), + (namespaces["html"], "noscript"), + (namespaces["html"], "object"), + (namespaces["html"], "ol"), + (namespaces["html"], "p"), + (namespaces["html"], "param"), + (namespaces["html"], "plaintext"), + (namespaces["html"], "pre"), + (namespaces["html"], "script"), + (namespaces["html"], "section"), + (namespaces["html"], "select"), + (namespaces["html"], "style"), + (namespaces["html"], "table"), + (namespaces["html"], "tbody"), + (namespaces["html"], "td"), + (namespaces["html"], "textarea"), + (namespaces["html"], "tfoot"), + (namespaces["html"], "th"), + (namespaces["html"], "thead"), + (namespaces["html"], "title"), + (namespaces["html"], "tr"), + (namespaces["html"], "ul"), + (namespaces["html"], "wbr"), + (namespaces["html"], "xmp"), + (namespaces["svg"], "foreignObject") +)) + +htmlIntegrationPointElements = frozenset(( + (namespaces["mathml"], "annotaion-xml"), + (namespaces["svg"], "foreignObject"), + (namespaces["svg"], "desc"), + (namespaces["svg"], "title") +)) + +mathmlTextIntegrationPointElements = frozenset(( + (namespaces["mathml"], "mi"), + (namespaces["mathml"], "mo"), + (namespaces["mathml"], "mn"), + (namespaces["mathml"], "ms"), + (namespaces["mathml"], "mtext") +)) + +spaceCharacters = frozenset(( + u"\t", + u"\n", + u"\u000C", + u" ", + u"\r" +)) + +tableInsertModeElements = frozenset(( + "table", + "tbody", + "tfoot", + "thead", + "tr" +)) + +asciiLowercase = frozenset(string.ascii_lowercase) +asciiUppercase = frozenset(string.ascii_uppercase) +asciiLetters = frozenset(string.ascii_letters) +digits = frozenset(string.digits) +hexDigits = frozenset(string.hexdigits) + +asciiUpper2Lower = dict([(ord(c),ord(c.lower())) + for c in string.ascii_uppercase]) + +# Heading elements need to be ordered +headingElements = ( + "h1", + "h2", + "h3", + "h4", + "h5", + "h6" +) + +voidElements = frozenset(( + "base", + "command", + "event-source", + "link", + "meta", + "hr", + "br", + "img", + "embed", + "param", + "area", + "col", + "input", + "source", + "track" +)) + +cdataElements = frozenset(('title', 'textarea')) + +rcdataElements = frozenset(( + 'style', + 'script', + 'xmp', + 'iframe', + 'noembed', + 'noframes', + 'noscript' +)) + +booleanAttributes = { + "": frozenset(("irrelevant",)), + "style": frozenset(("scoped",)), + "img": frozenset(("ismap",)), + "audio": frozenset(("autoplay","controls")), + "video": frozenset(("autoplay","controls")), + "script": frozenset(("defer", "async")), + "details": frozenset(("open",)), + "datagrid": frozenset(("multiple", "disabled")), + "command": frozenset(("hidden", "disabled", "checked", "default")), + "hr": frozenset(("noshade")), + "menu": frozenset(("autosubmit",)), + "fieldset": frozenset(("disabled", "readonly")), + "option": frozenset(("disabled", "readonly", "selected")), + "optgroup": frozenset(("disabled", "readonly")), + "button": frozenset(("disabled", "autofocus")), + "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), + "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), + "output": frozenset(("disabled", "readonly")), +} + +# entitiesWindows1252 has to be _ordered_ and needs to have an index. It +# therefore can't be a frozenset. +entitiesWindows1252 = ( + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS +) + +xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) + +entities = { + "AElig": u"\xc6", + "AElig;": u"\xc6", + "AMP": u"&", + "AMP;": u"&", + "Aacute": u"\xc1", + "Aacute;": u"\xc1", + "Abreve;": u"\u0102", + "Acirc": u"\xc2", + "Acirc;": u"\xc2", + "Acy;": u"\u0410", + "Afr;": u"\U0001d504", + "Agrave": u"\xc0", + "Agrave;": u"\xc0", + "Alpha;": u"\u0391", + "Amacr;": u"\u0100", + "And;": u"\u2a53", + "Aogon;": u"\u0104", + "Aopf;": u"\U0001d538", + "ApplyFunction;": u"\u2061", + "Aring": u"\xc5", + "Aring;": u"\xc5", + "Ascr;": u"\U0001d49c", + "Assign;": u"\u2254", + "Atilde": u"\xc3", + "Atilde;": u"\xc3", + "Auml": u"\xc4", + "Auml;": u"\xc4", + "Backslash;": u"\u2216", + "Barv;": u"\u2ae7", + "Barwed;": u"\u2306", + "Bcy;": u"\u0411", + "Because;": u"\u2235", + "Bernoullis;": u"\u212c", + "Beta;": u"\u0392", + "Bfr;": u"\U0001d505", + "Bopf;": u"\U0001d539", + "Breve;": u"\u02d8", + "Bscr;": u"\u212c", + "Bumpeq;": u"\u224e", + "CHcy;": u"\u0427", + "COPY": u"\xa9", + "COPY;": u"\xa9", + "Cacute;": u"\u0106", + "Cap;": u"\u22d2", + "CapitalDifferentialD;": u"\u2145", + "Cayleys;": u"\u212d", + "Ccaron;": u"\u010c", + "Ccedil": u"\xc7", + "Ccedil;": u"\xc7", + "Ccirc;": u"\u0108", + "Cconint;": u"\u2230", + "Cdot;": u"\u010a", + "Cedilla;": u"\xb8", + "CenterDot;": u"\xb7", + "Cfr;": u"\u212d", + "Chi;": u"\u03a7", + "CircleDot;": u"\u2299", + "CircleMinus;": u"\u2296", + "CirclePlus;": u"\u2295", + "CircleTimes;": u"\u2297", + "ClockwiseContourIntegral;": u"\u2232", + "CloseCurlyDoubleQuote;": u"\u201d", + "CloseCurlyQuote;": u"\u2019", + "Colon;": u"\u2237", + "Colone;": u"\u2a74", + "Congruent;": u"\u2261", + "Conint;": u"\u222f", + "ContourIntegral;": u"\u222e", + "Copf;": u"\u2102", + "Coproduct;": u"\u2210", + "CounterClockwiseContourIntegral;": u"\u2233", + "Cross;": u"\u2a2f", + "Cscr;": u"\U0001d49e", + "Cup;": u"\u22d3", + "CupCap;": u"\u224d", + "DD;": u"\u2145", + "DDotrahd;": u"\u2911", + "DJcy;": u"\u0402", + "DScy;": u"\u0405", + "DZcy;": u"\u040f", + "Dagger;": u"\u2021", + "Darr;": u"\u21a1", + "Dashv;": u"\u2ae4", + "Dcaron;": u"\u010e", + "Dcy;": u"\u0414", + "Del;": u"\u2207", + "Delta;": u"\u0394", + "Dfr;": u"\U0001d507", + "DiacriticalAcute;": u"\xb4", + "DiacriticalDot;": u"\u02d9", + "DiacriticalDoubleAcute;": u"\u02dd", + "DiacriticalGrave;": u"`", + "DiacriticalTilde;": u"\u02dc", + "Diamond;": u"\u22c4", + "DifferentialD;": u"\u2146", + "Dopf;": u"\U0001d53b", + "Dot;": u"\xa8", + "DotDot;": u"\u20dc", + "DotEqual;": u"\u2250", + "DoubleContourIntegral;": u"\u222f", + "DoubleDot;": u"\xa8", + "DoubleDownArrow;": u"\u21d3", + "DoubleLeftArrow;": u"\u21d0", + "DoubleLeftRightArrow;": u"\u21d4", + "DoubleLeftTee;": u"\u2ae4", + "DoubleLongLeftArrow;": u"\u27f8", + "DoubleLongLeftRightArrow;": u"\u27fa", + "DoubleLongRightArrow;": u"\u27f9", + "DoubleRightArrow;": u"\u21d2", + "DoubleRightTee;": u"\u22a8", + "DoubleUpArrow;": u"\u21d1", + "DoubleUpDownArrow;": u"\u21d5", + "DoubleVerticalBar;": u"\u2225", + "DownArrow;": u"\u2193", + "DownArrowBar;": u"\u2913", + "DownArrowUpArrow;": u"\u21f5", + "DownBreve;": u"\u0311", + "DownLeftRightVector;": u"\u2950", + "DownLeftTeeVector;": u"\u295e", + "DownLeftVector;": u"\u21bd", + "DownLeftVectorBar;": u"\u2956", + "DownRightTeeVector;": u"\u295f", + "DownRightVector;": u"\u21c1", + "DownRightVectorBar;": u"\u2957", + "DownTee;": u"\u22a4", + "DownTeeArrow;": u"\u21a7", + "Downarrow;": u"\u21d3", + "Dscr;": u"\U0001d49f", + "Dstrok;": u"\u0110", + "ENG;": u"\u014a", + "ETH": u"\xd0", + "ETH;": u"\xd0", + "Eacute": u"\xc9", + "Eacute;": u"\xc9", + "Ecaron;": u"\u011a", + "Ecirc": u"\xca", + "Ecirc;": u"\xca", + "Ecy;": u"\u042d", + "Edot;": u"\u0116", + "Efr;": u"\U0001d508", + "Egrave": u"\xc8", + "Egrave;": u"\xc8", + "Element;": u"\u2208", + "Emacr;": u"\u0112", + "EmptySmallSquare;": u"\u25fb", + "EmptyVerySmallSquare;": u"\u25ab", + "Eogon;": u"\u0118", + "Eopf;": u"\U0001d53c", + "Epsilon;": u"\u0395", + "Equal;": u"\u2a75", + "EqualTilde;": u"\u2242", + "Equilibrium;": u"\u21cc", + "Escr;": u"\u2130", + "Esim;": u"\u2a73", + "Eta;": u"\u0397", + "Euml": u"\xcb", + "Euml;": u"\xcb", + "Exists;": u"\u2203", + "ExponentialE;": u"\u2147", + "Fcy;": u"\u0424", + "Ffr;": u"\U0001d509", + "FilledSmallSquare;": u"\u25fc", + "FilledVerySmallSquare;": u"\u25aa", + "Fopf;": u"\U0001d53d", + "ForAll;": u"\u2200", + "Fouriertrf;": u"\u2131", + "Fscr;": u"\u2131", + "GJcy;": u"\u0403", + "GT": u">", + "GT;": u">", + "Gamma;": u"\u0393", + "Gammad;": u"\u03dc", + "Gbreve;": u"\u011e", + "Gcedil;": u"\u0122", + "Gcirc;": u"\u011c", + "Gcy;": u"\u0413", + "Gdot;": u"\u0120", + "Gfr;": u"\U0001d50a", + "Gg;": u"\u22d9", + "Gopf;": u"\U0001d53e", + "GreaterEqual;": u"\u2265", + "GreaterEqualLess;": u"\u22db", + "GreaterFullEqual;": u"\u2267", + "GreaterGreater;": u"\u2aa2", + "GreaterLess;": u"\u2277", + "GreaterSlantEqual;": u"\u2a7e", + "GreaterTilde;": u"\u2273", + "Gscr;": u"\U0001d4a2", + "Gt;": u"\u226b", + "HARDcy;": u"\u042a", + "Hacek;": u"\u02c7", + "Hat;": u"^", + "Hcirc;": u"\u0124", + "Hfr;": u"\u210c", + "HilbertSpace;": u"\u210b", + "Hopf;": u"\u210d", + "HorizontalLine;": u"\u2500", + "Hscr;": u"\u210b", + "Hstrok;": u"\u0126", + "HumpDownHump;": u"\u224e", + "HumpEqual;": u"\u224f", + "IEcy;": u"\u0415", + "IJlig;": u"\u0132", + "IOcy;": u"\u0401", + "Iacute": u"\xcd", + "Iacute;": u"\xcd", + "Icirc": u"\xce", + "Icirc;": u"\xce", + "Icy;": u"\u0418", + "Idot;": u"\u0130", + "Ifr;": u"\u2111", + "Igrave": u"\xcc", + "Igrave;": u"\xcc", + "Im;": u"\u2111", + "Imacr;": u"\u012a", + "ImaginaryI;": u"\u2148", + "Implies;": u"\u21d2", + "Int;": u"\u222c", + "Integral;": u"\u222b", + "Intersection;": u"\u22c2", + "InvisibleComma;": u"\u2063", + "InvisibleTimes;": u"\u2062", + "Iogon;": u"\u012e", + "Iopf;": u"\U0001d540", + "Iota;": u"\u0399", + "Iscr;": u"\u2110", + "Itilde;": u"\u0128", + "Iukcy;": u"\u0406", + "Iuml": u"\xcf", + "Iuml;": u"\xcf", + "Jcirc;": u"\u0134", + "Jcy;": u"\u0419", + "Jfr;": u"\U0001d50d", + "Jopf;": u"\U0001d541", + "Jscr;": u"\U0001d4a5", + "Jsercy;": u"\u0408", + "Jukcy;": u"\u0404", + "KHcy;": u"\u0425", + "KJcy;": u"\u040c", + "Kappa;": u"\u039a", + "Kcedil;": u"\u0136", + "Kcy;": u"\u041a", + "Kfr;": u"\U0001d50e", + "Kopf;": u"\U0001d542", + "Kscr;": u"\U0001d4a6", + "LJcy;": u"\u0409", + "LT": u"<", + "LT;": u"<", + "Lacute;": u"\u0139", + "Lambda;": u"\u039b", + "Lang;": u"\u27ea", + "Laplacetrf;": u"\u2112", + "Larr;": u"\u219e", + "Lcaron;": u"\u013d", + "Lcedil;": u"\u013b", + "Lcy;": u"\u041b", + "LeftAngleBracket;": u"\u27e8", + "LeftArrow;": u"\u2190", + "LeftArrowBar;": u"\u21e4", + "LeftArrowRightArrow;": u"\u21c6", + "LeftCeiling;": u"\u2308", + "LeftDoubleBracket;": u"\u27e6", + "LeftDownTeeVector;": u"\u2961", + "LeftDownVector;": u"\u21c3", + "LeftDownVectorBar;": u"\u2959", + "LeftFloor;": u"\u230a", + "LeftRightArrow;": u"\u2194", + "LeftRightVector;": u"\u294e", + "LeftTee;": u"\u22a3", + "LeftTeeArrow;": u"\u21a4", + "LeftTeeVector;": u"\u295a", + "LeftTriangle;": u"\u22b2", + "LeftTriangleBar;": u"\u29cf", + "LeftTriangleEqual;": u"\u22b4", + "LeftUpDownVector;": u"\u2951", + "LeftUpTeeVector;": u"\u2960", + "LeftUpVector;": u"\u21bf", + "LeftUpVectorBar;": u"\u2958", + "LeftVector;": u"\u21bc", + "LeftVectorBar;": u"\u2952", + "Leftarrow;": u"\u21d0", + "Leftrightarrow;": u"\u21d4", + "LessEqualGreater;": u"\u22da", + "LessFullEqual;": u"\u2266", + "LessGreater;": u"\u2276", + "LessLess;": u"\u2aa1", + "LessSlantEqual;": u"\u2a7d", + "LessTilde;": u"\u2272", + "Lfr;": u"\U0001d50f", + "Ll;": u"\u22d8", + "Lleftarrow;": u"\u21da", + "Lmidot;": u"\u013f", + "LongLeftArrow;": u"\u27f5", + "LongLeftRightArrow;": u"\u27f7", + "LongRightArrow;": u"\u27f6", + "Longleftarrow;": u"\u27f8", + "Longleftrightarrow;": u"\u27fa", + "Longrightarrow;": u"\u27f9", + "Lopf;": u"\U0001d543", + "LowerLeftArrow;": u"\u2199", + "LowerRightArrow;": u"\u2198", + "Lscr;": u"\u2112", + "Lsh;": u"\u21b0", + "Lstrok;": u"\u0141", + "Lt;": u"\u226a", + "Map;": u"\u2905", + "Mcy;": u"\u041c", + "MediumSpace;": u"\u205f", + "Mellintrf;": u"\u2133", + "Mfr;": u"\U0001d510", + "MinusPlus;": u"\u2213", + "Mopf;": u"\U0001d544", + "Mscr;": u"\u2133", + "Mu;": u"\u039c", + "NJcy;": u"\u040a", + "Nacute;": u"\u0143", + "Ncaron;": u"\u0147", + "Ncedil;": u"\u0145", + "Ncy;": u"\u041d", + "NegativeMediumSpace;": u"\u200b", + "NegativeThickSpace;": u"\u200b", + "NegativeThinSpace;": u"\u200b", + "NegativeVeryThinSpace;": u"\u200b", + "NestedGreaterGreater;": u"\u226b", + "NestedLessLess;": u"\u226a", + "NewLine;": u"\n", + "Nfr;": u"\U0001d511", + "NoBreak;": u"\u2060", + "NonBreakingSpace;": u"\xa0", + "Nopf;": u"\u2115", + "Not;": u"\u2aec", + "NotCongruent;": u"\u2262", + "NotCupCap;": u"\u226d", + "NotDoubleVerticalBar;": u"\u2226", + "NotElement;": u"\u2209", + "NotEqual;": u"\u2260", + "NotEqualTilde;": u"\u2242\u0338", + "NotExists;": u"\u2204", + "NotGreater;": u"\u226f", + "NotGreaterEqual;": u"\u2271", + "NotGreaterFullEqual;": u"\u2267\u0338", + "NotGreaterGreater;": u"\u226b\u0338", + "NotGreaterLess;": u"\u2279", + "NotGreaterSlantEqual;": u"\u2a7e\u0338", + "NotGreaterTilde;": u"\u2275", + "NotHumpDownHump;": u"\u224e\u0338", + "NotHumpEqual;": u"\u224f\u0338", + "NotLeftTriangle;": u"\u22ea", + "NotLeftTriangleBar;": u"\u29cf\u0338", + "NotLeftTriangleEqual;": u"\u22ec", + "NotLess;": u"\u226e", + "NotLessEqual;": u"\u2270", + "NotLessGreater;": u"\u2278", + "NotLessLess;": u"\u226a\u0338", + "NotLessSlantEqual;": u"\u2a7d\u0338", + "NotLessTilde;": u"\u2274", + "NotNestedGreaterGreater;": u"\u2aa2\u0338", + "NotNestedLessLess;": u"\u2aa1\u0338", + "NotPrecedes;": u"\u2280", + "NotPrecedesEqual;": u"\u2aaf\u0338", + "NotPrecedesSlantEqual;": u"\u22e0", + "NotReverseElement;": u"\u220c", + "NotRightTriangle;": u"\u22eb", + "NotRightTriangleBar;": u"\u29d0\u0338", + "NotRightTriangleEqual;": u"\u22ed", + "NotSquareSubset;": u"\u228f\u0338", + "NotSquareSubsetEqual;": u"\u22e2", + "NotSquareSuperset;": u"\u2290\u0338", + "NotSquareSupersetEqual;": u"\u22e3", + "NotSubset;": u"\u2282\u20d2", + "NotSubsetEqual;": u"\u2288", + "NotSucceeds;": u"\u2281", + "NotSucceedsEqual;": u"\u2ab0\u0338", + "NotSucceedsSlantEqual;": u"\u22e1", + "NotSucceedsTilde;": u"\u227f\u0338", + "NotSuperset;": u"\u2283\u20d2", + "NotSupersetEqual;": u"\u2289", + "NotTilde;": u"\u2241", + "NotTildeEqual;": u"\u2244", + "NotTildeFullEqual;": u"\u2247", + "NotTildeTilde;": u"\u2249", + "NotVerticalBar;": u"\u2224", + "Nscr;": u"\U0001d4a9", + "Ntilde": u"\xd1", + "Ntilde;": u"\xd1", + "Nu;": u"\u039d", + "OElig;": u"\u0152", + "Oacute": u"\xd3", + "Oacute;": u"\xd3", + "Ocirc": u"\xd4", + "Ocirc;": u"\xd4", + "Ocy;": u"\u041e", + "Odblac;": u"\u0150", + "Ofr;": u"\U0001d512", + "Ograve": u"\xd2", + "Ograve;": u"\xd2", + "Omacr;": u"\u014c", + "Omega;": u"\u03a9", + "Omicron;": u"\u039f", + "Oopf;": u"\U0001d546", + "OpenCurlyDoubleQuote;": u"\u201c", + "OpenCurlyQuote;": u"\u2018", + "Or;": u"\u2a54", + "Oscr;": u"\U0001d4aa", + "Oslash": u"\xd8", + "Oslash;": u"\xd8", + "Otilde": u"\xd5", + "Otilde;": u"\xd5", + "Otimes;": u"\u2a37", + "Ouml": u"\xd6", + "Ouml;": u"\xd6", + "OverBar;": u"\u203e", + "OverBrace;": u"\u23de", + "OverBracket;": u"\u23b4", + "OverParenthesis;": u"\u23dc", + "PartialD;": u"\u2202", + "Pcy;": u"\u041f", + "Pfr;": u"\U0001d513", + "Phi;": u"\u03a6", + "Pi;": u"\u03a0", + "PlusMinus;": u"\xb1", + "Poincareplane;": u"\u210c", + "Popf;": u"\u2119", + "Pr;": u"\u2abb", + "Precedes;": u"\u227a", + "PrecedesEqual;": u"\u2aaf", + "PrecedesSlantEqual;": u"\u227c", + "PrecedesTilde;": u"\u227e", + "Prime;": u"\u2033", + "Product;": u"\u220f", + "Proportion;": u"\u2237", + "Proportional;": u"\u221d", + "Pscr;": u"\U0001d4ab", + "Psi;": u"\u03a8", + "QUOT": u"\"", + "QUOT;": u"\"", + "Qfr;": u"\U0001d514", + "Qopf;": u"\u211a", + "Qscr;": u"\U0001d4ac", + "RBarr;": u"\u2910", + "REG": u"\xae", + "REG;": u"\xae", + "Racute;": u"\u0154", + "Rang;": u"\u27eb", + "Rarr;": u"\u21a0", + "Rarrtl;": u"\u2916", + "Rcaron;": u"\u0158", + "Rcedil;": u"\u0156", + "Rcy;": u"\u0420", + "Re;": u"\u211c", + "ReverseElement;": u"\u220b", + "ReverseEquilibrium;": u"\u21cb", + "ReverseUpEquilibrium;": u"\u296f", + "Rfr;": u"\u211c", + "Rho;": u"\u03a1", + "RightAngleBracket;": u"\u27e9", + "RightArrow;": u"\u2192", + "RightArrowBar;": u"\u21e5", + "RightArrowLeftArrow;": u"\u21c4", + "RightCeiling;": u"\u2309", + "RightDoubleBracket;": u"\u27e7", + "RightDownTeeVector;": u"\u295d", + "RightDownVector;": u"\u21c2", + "RightDownVectorBar;": u"\u2955", + "RightFloor;": u"\u230b", + "RightTee;": u"\u22a2", + "RightTeeArrow;": u"\u21a6", + "RightTeeVector;": u"\u295b", + "RightTriangle;": u"\u22b3", + "RightTriangleBar;": u"\u29d0", + "RightTriangleEqual;": u"\u22b5", + "RightUpDownVector;": u"\u294f", + "RightUpTeeVector;": u"\u295c", + "RightUpVector;": u"\u21be", + "RightUpVectorBar;": u"\u2954", + "RightVector;": u"\u21c0", + "RightVectorBar;": u"\u2953", + "Rightarrow;": u"\u21d2", + "Ropf;": u"\u211d", + "RoundImplies;": u"\u2970", + "Rrightarrow;": u"\u21db", + "Rscr;": u"\u211b", + "Rsh;": u"\u21b1", + "RuleDelayed;": u"\u29f4", + "SHCHcy;": u"\u0429", + "SHcy;": u"\u0428", + "SOFTcy;": u"\u042c", + "Sacute;": u"\u015a", + "Sc;": u"\u2abc", + "Scaron;": u"\u0160", + "Scedil;": u"\u015e", + "Scirc;": u"\u015c", + "Scy;": u"\u0421", + "Sfr;": u"\U0001d516", + "ShortDownArrow;": u"\u2193", + "ShortLeftArrow;": u"\u2190", + "ShortRightArrow;": u"\u2192", + "ShortUpArrow;": u"\u2191", + "Sigma;": u"\u03a3", + "SmallCircle;": u"\u2218", + "Sopf;": u"\U0001d54a", + "Sqrt;": u"\u221a", + "Square;": u"\u25a1", + "SquareIntersection;": u"\u2293", + "SquareSubset;": u"\u228f", + "SquareSubsetEqual;": u"\u2291", + "SquareSuperset;": u"\u2290", + "SquareSupersetEqual;": u"\u2292", + "SquareUnion;": u"\u2294", + "Sscr;": u"\U0001d4ae", + "Star;": u"\u22c6", + "Sub;": u"\u22d0", + "Subset;": u"\u22d0", + "SubsetEqual;": u"\u2286", + "Succeeds;": u"\u227b", + "SucceedsEqual;": u"\u2ab0", + "SucceedsSlantEqual;": u"\u227d", + "SucceedsTilde;": u"\u227f", + "SuchThat;": u"\u220b", + "Sum;": u"\u2211", + "Sup;": u"\u22d1", + "Superset;": u"\u2283", + "SupersetEqual;": u"\u2287", + "Supset;": u"\u22d1", + "THORN": u"\xde", + "THORN;": u"\xde", + "TRADE;": u"\u2122", + "TSHcy;": u"\u040b", + "TScy;": u"\u0426", + "Tab;": u"\t", + "Tau;": u"\u03a4", + "Tcaron;": u"\u0164", + "Tcedil;": u"\u0162", + "Tcy;": u"\u0422", + "Tfr;": u"\U0001d517", + "Therefore;": u"\u2234", + "Theta;": u"\u0398", + "ThickSpace;": u"\u205f\u200a", + "ThinSpace;": u"\u2009", + "Tilde;": u"\u223c", + "TildeEqual;": u"\u2243", + "TildeFullEqual;": u"\u2245", + "TildeTilde;": u"\u2248", + "Topf;": u"\U0001d54b", + "TripleDot;": u"\u20db", + "Tscr;": u"\U0001d4af", + "Tstrok;": u"\u0166", + "Uacute": u"\xda", + "Uacute;": u"\xda", + "Uarr;": u"\u219f", + "Uarrocir;": u"\u2949", + "Ubrcy;": u"\u040e", + "Ubreve;": u"\u016c", + "Ucirc": u"\xdb", + "Ucirc;": u"\xdb", + "Ucy;": u"\u0423", + "Udblac;": u"\u0170", + "Ufr;": u"\U0001d518", + "Ugrave": u"\xd9", + "Ugrave;": u"\xd9", + "Umacr;": u"\u016a", + "UnderBar;": u"_", + "UnderBrace;": u"\u23df", + "UnderBracket;": u"\u23b5", + "UnderParenthesis;": u"\u23dd", + "Union;": u"\u22c3", + "UnionPlus;": u"\u228e", + "Uogon;": u"\u0172", + "Uopf;": u"\U0001d54c", + "UpArrow;": u"\u2191", + "UpArrowBar;": u"\u2912", + "UpArrowDownArrow;": u"\u21c5", + "UpDownArrow;": u"\u2195", + "UpEquilibrium;": u"\u296e", + "UpTee;": u"\u22a5", + "UpTeeArrow;": u"\u21a5", + "Uparrow;": u"\u21d1", + "Updownarrow;": u"\u21d5", + "UpperLeftArrow;": u"\u2196", + "UpperRightArrow;": u"\u2197", + "Upsi;": u"\u03d2", + "Upsilon;": u"\u03a5", + "Uring;": u"\u016e", + "Uscr;": u"\U0001d4b0", + "Utilde;": u"\u0168", + "Uuml": u"\xdc", + "Uuml;": u"\xdc", + "VDash;": u"\u22ab", + "Vbar;": u"\u2aeb", + "Vcy;": u"\u0412", + "Vdash;": u"\u22a9", + "Vdashl;": u"\u2ae6", + "Vee;": u"\u22c1", + "Verbar;": u"\u2016", + "Vert;": u"\u2016", + "VerticalBar;": u"\u2223", + "VerticalLine;": u"|", + "VerticalSeparator;": u"\u2758", + "VerticalTilde;": u"\u2240", + "VeryThinSpace;": u"\u200a", + "Vfr;": u"\U0001d519", + "Vopf;": u"\U0001d54d", + "Vscr;": u"\U0001d4b1", + "Vvdash;": u"\u22aa", + "Wcirc;": u"\u0174", + "Wedge;": u"\u22c0", + "Wfr;": u"\U0001d51a", + "Wopf;": u"\U0001d54e", + "Wscr;": u"\U0001d4b2", + "Xfr;": u"\U0001d51b", + "Xi;": u"\u039e", + "Xopf;": u"\U0001d54f", + "Xscr;": u"\U0001d4b3", + "YAcy;": u"\u042f", + "YIcy;": u"\u0407", + "YUcy;": u"\u042e", + "Yacute": u"\xdd", + "Yacute;": u"\xdd", + "Ycirc;": u"\u0176", + "Ycy;": u"\u042b", + "Yfr;": u"\U0001d51c", + "Yopf;": u"\U0001d550", + "Yscr;": u"\U0001d4b4", + "Yuml;": u"\u0178", + "ZHcy;": u"\u0416", + "Zacute;": u"\u0179", + "Zcaron;": u"\u017d", + "Zcy;": u"\u0417", + "Zdot;": u"\u017b", + "ZeroWidthSpace;": u"\u200b", + "Zeta;": u"\u0396", + "Zfr;": u"\u2128", + "Zopf;": u"\u2124", + "Zscr;": u"\U0001d4b5", + "aacute": u"\xe1", + "aacute;": u"\xe1", + "abreve;": u"\u0103", + "ac;": u"\u223e", + "acE;": u"\u223e\u0333", + "acd;": u"\u223f", + "acirc": u"\xe2", + "acirc;": u"\xe2", + "acute": u"\xb4", + "acute;": u"\xb4", + "acy;": u"\u0430", + "aelig": u"\xe6", + "aelig;": u"\xe6", + "af;": u"\u2061", + "afr;": u"\U0001d51e", + "agrave": u"\xe0", + "agrave;": u"\xe0", + "alefsym;": u"\u2135", + "aleph;": u"\u2135", + "alpha;": u"\u03b1", + "amacr;": u"\u0101", + "amalg;": u"\u2a3f", + "amp": u"&", + "amp;": u"&", + "and;": u"\u2227", + "andand;": u"\u2a55", + "andd;": u"\u2a5c", + "andslope;": u"\u2a58", + "andv;": u"\u2a5a", + "ang;": u"\u2220", + "ange;": u"\u29a4", + "angle;": u"\u2220", + "angmsd;": u"\u2221", + "angmsdaa;": u"\u29a8", + "angmsdab;": u"\u29a9", + "angmsdac;": u"\u29aa", + "angmsdad;": u"\u29ab", + "angmsdae;": u"\u29ac", + "angmsdaf;": u"\u29ad", + "angmsdag;": u"\u29ae", + "angmsdah;": u"\u29af", + "angrt;": u"\u221f", + "angrtvb;": u"\u22be", + "angrtvbd;": u"\u299d", + "angsph;": u"\u2222", + "angst;": u"\xc5", + "angzarr;": u"\u237c", + "aogon;": u"\u0105", + "aopf;": u"\U0001d552", + "ap;": u"\u2248", + "apE;": u"\u2a70", + "apacir;": u"\u2a6f", + "ape;": u"\u224a", + "apid;": u"\u224b", + "apos;": u"'", + "approx;": u"\u2248", + "approxeq;": u"\u224a", + "aring": u"\xe5", + "aring;": u"\xe5", + "ascr;": u"\U0001d4b6", + "ast;": u"*", + "asymp;": u"\u2248", + "asympeq;": u"\u224d", + "atilde": u"\xe3", + "atilde;": u"\xe3", + "auml": u"\xe4", + "auml;": u"\xe4", + "awconint;": u"\u2233", + "awint;": u"\u2a11", + "bNot;": u"\u2aed", + "backcong;": u"\u224c", + "backepsilon;": u"\u03f6", + "backprime;": u"\u2035", + "backsim;": u"\u223d", + "backsimeq;": u"\u22cd", + "barvee;": u"\u22bd", + "barwed;": u"\u2305", + "barwedge;": u"\u2305", + "bbrk;": u"\u23b5", + "bbrktbrk;": u"\u23b6", + "bcong;": u"\u224c", + "bcy;": u"\u0431", + "bdquo;": u"\u201e", + "becaus;": u"\u2235", + "because;": u"\u2235", + "bemptyv;": u"\u29b0", + "bepsi;": u"\u03f6", + "bernou;": u"\u212c", + "beta;": u"\u03b2", + "beth;": u"\u2136", + "between;": u"\u226c", + "bfr;": u"\U0001d51f", + "bigcap;": u"\u22c2", + "bigcirc;": u"\u25ef", + "bigcup;": u"\u22c3", + "bigodot;": u"\u2a00", + "bigoplus;": u"\u2a01", + "bigotimes;": u"\u2a02", + "bigsqcup;": u"\u2a06", + "bigstar;": u"\u2605", + "bigtriangledown;": u"\u25bd", + "bigtriangleup;": u"\u25b3", + "biguplus;": u"\u2a04", + "bigvee;": u"\u22c1", + "bigwedge;": u"\u22c0", + "bkarow;": u"\u290d", + "blacklozenge;": u"\u29eb", + "blacksquare;": u"\u25aa", + "blacktriangle;": u"\u25b4", + "blacktriangledown;": u"\u25be", + "blacktriangleleft;": u"\u25c2", + "blacktriangleright;": u"\u25b8", + "blank;": u"\u2423", + "blk12;": u"\u2592", + "blk14;": u"\u2591", + "blk34;": u"\u2593", + "block;": u"\u2588", + "bne;": u"=\u20e5", + "bnequiv;": u"\u2261\u20e5", + "bnot;": u"\u2310", + "bopf;": u"\U0001d553", + "bot;": u"\u22a5", + "bottom;": u"\u22a5", + "bowtie;": u"\u22c8", + "boxDL;": u"\u2557", + "boxDR;": u"\u2554", + "boxDl;": u"\u2556", + "boxDr;": u"\u2553", + "boxH;": u"\u2550", + "boxHD;": u"\u2566", + "boxHU;": u"\u2569", + "boxHd;": u"\u2564", + "boxHu;": u"\u2567", + "boxUL;": u"\u255d", + "boxUR;": u"\u255a", + "boxUl;": u"\u255c", + "boxUr;": u"\u2559", + "boxV;": u"\u2551", + "boxVH;": u"\u256c", + "boxVL;": u"\u2563", + "boxVR;": u"\u2560", + "boxVh;": u"\u256b", + "boxVl;": u"\u2562", + "boxVr;": u"\u255f", + "boxbox;": u"\u29c9", + "boxdL;": u"\u2555", + "boxdR;": u"\u2552", + "boxdl;": u"\u2510", + "boxdr;": u"\u250c", + "boxh;": u"\u2500", + "boxhD;": u"\u2565", + "boxhU;": u"\u2568", + "boxhd;": u"\u252c", + "boxhu;": u"\u2534", + "boxminus;": u"\u229f", + "boxplus;": u"\u229e", + "boxtimes;": u"\u22a0", + "boxuL;": u"\u255b", + "boxuR;": u"\u2558", + "boxul;": u"\u2518", + "boxur;": u"\u2514", + "boxv;": u"\u2502", + "boxvH;": u"\u256a", + "boxvL;": u"\u2561", + "boxvR;": u"\u255e", + "boxvh;": u"\u253c", + "boxvl;": u"\u2524", + "boxvr;": u"\u251c", + "bprime;": u"\u2035", + "breve;": u"\u02d8", + "brvbar": u"\xa6", + "brvbar;": u"\xa6", + "bscr;": u"\U0001d4b7", + "bsemi;": u"\u204f", + "bsim;": u"\u223d", + "bsime;": u"\u22cd", + "bsol;": u"\\", + "bsolb;": u"\u29c5", + "bsolhsub;": u"\u27c8", + "bull;": u"\u2022", + "bullet;": u"\u2022", + "bump;": u"\u224e", + "bumpE;": u"\u2aae", + "bumpe;": u"\u224f", + "bumpeq;": u"\u224f", + "cacute;": u"\u0107", + "cap;": u"\u2229", + "capand;": u"\u2a44", + "capbrcup;": u"\u2a49", + "capcap;": u"\u2a4b", + "capcup;": u"\u2a47", + "capdot;": u"\u2a40", + "caps;": u"\u2229\ufe00", + "caret;": u"\u2041", + "caron;": u"\u02c7", + "ccaps;": u"\u2a4d", + "ccaron;": u"\u010d", + "ccedil": u"\xe7", + "ccedil;": u"\xe7", + "ccirc;": u"\u0109", + "ccups;": u"\u2a4c", + "ccupssm;": u"\u2a50", + "cdot;": u"\u010b", + "cedil": u"\xb8", + "cedil;": u"\xb8", + "cemptyv;": u"\u29b2", + "cent": u"\xa2", + "cent;": u"\xa2", + "centerdot;": u"\xb7", + "cfr;": u"\U0001d520", + "chcy;": u"\u0447", + "check;": u"\u2713", + "checkmark;": u"\u2713", + "chi;": u"\u03c7", + "cir;": u"\u25cb", + "cirE;": u"\u29c3", + "circ;": u"\u02c6", + "circeq;": u"\u2257", + "circlearrowleft;": u"\u21ba", + "circlearrowright;": u"\u21bb", + "circledR;": u"\xae", + "circledS;": u"\u24c8", + "circledast;": u"\u229b", + "circledcirc;": u"\u229a", + "circleddash;": u"\u229d", + "cire;": u"\u2257", + "cirfnint;": u"\u2a10", + "cirmid;": u"\u2aef", + "cirscir;": u"\u29c2", + "clubs;": u"\u2663", + "clubsuit;": u"\u2663", + "colon;": u":", + "colone;": u"\u2254", + "coloneq;": u"\u2254", + "comma;": u",", + "commat;": u"@", + "comp;": u"\u2201", + "compfn;": u"\u2218", + "complement;": u"\u2201", + "complexes;": u"\u2102", + "cong;": u"\u2245", + "congdot;": u"\u2a6d", + "conint;": u"\u222e", + "copf;": u"\U0001d554", + "coprod;": u"\u2210", + "copy": u"\xa9", + "copy;": u"\xa9", + "copysr;": u"\u2117", + "crarr;": u"\u21b5", + "cross;": u"\u2717", + "cscr;": u"\U0001d4b8", + "csub;": u"\u2acf", + "csube;": u"\u2ad1", + "csup;": u"\u2ad0", + "csupe;": u"\u2ad2", + "ctdot;": u"\u22ef", + "cudarrl;": u"\u2938", + "cudarrr;": u"\u2935", + "cuepr;": u"\u22de", + "cuesc;": u"\u22df", + "cularr;": u"\u21b6", + "cularrp;": u"\u293d", + "cup;": u"\u222a", + "cupbrcap;": u"\u2a48", + "cupcap;": u"\u2a46", + "cupcup;": u"\u2a4a", + "cupdot;": u"\u228d", + "cupor;": u"\u2a45", + "cups;": u"\u222a\ufe00", + "curarr;": u"\u21b7", + "curarrm;": u"\u293c", + "curlyeqprec;": u"\u22de", + "curlyeqsucc;": u"\u22df", + "curlyvee;": u"\u22ce", + "curlywedge;": u"\u22cf", + "curren": u"\xa4", + "curren;": u"\xa4", + "curvearrowleft;": u"\u21b6", + "curvearrowright;": u"\u21b7", + "cuvee;": u"\u22ce", + "cuwed;": u"\u22cf", + "cwconint;": u"\u2232", + "cwint;": u"\u2231", + "cylcty;": u"\u232d", + "dArr;": u"\u21d3", + "dHar;": u"\u2965", + "dagger;": u"\u2020", + "daleth;": u"\u2138", + "darr;": u"\u2193", + "dash;": u"\u2010", + "dashv;": u"\u22a3", + "dbkarow;": u"\u290f", + "dblac;": u"\u02dd", + "dcaron;": u"\u010f", + "dcy;": u"\u0434", + "dd;": u"\u2146", + "ddagger;": u"\u2021", + "ddarr;": u"\u21ca", + "ddotseq;": u"\u2a77", + "deg": u"\xb0", + "deg;": u"\xb0", + "delta;": u"\u03b4", + "demptyv;": u"\u29b1", + "dfisht;": u"\u297f", + "dfr;": u"\U0001d521", + "dharl;": u"\u21c3", + "dharr;": u"\u21c2", + "diam;": u"\u22c4", + "diamond;": u"\u22c4", + "diamondsuit;": u"\u2666", + "diams;": u"\u2666", + "die;": u"\xa8", + "digamma;": u"\u03dd", + "disin;": u"\u22f2", + "div;": u"\xf7", + "divide": u"\xf7", + "divide;": u"\xf7", + "divideontimes;": u"\u22c7", + "divonx;": u"\u22c7", + "djcy;": u"\u0452", + "dlcorn;": u"\u231e", + "dlcrop;": u"\u230d", + "dollar;": u"$", + "dopf;": u"\U0001d555", + "dot;": u"\u02d9", + "doteq;": u"\u2250", + "doteqdot;": u"\u2251", + "dotminus;": u"\u2238", + "dotplus;": u"\u2214", + "dotsquare;": u"\u22a1", + "doublebarwedge;": u"\u2306", + "downarrow;": u"\u2193", + "downdownarrows;": u"\u21ca", + "downharpoonleft;": u"\u21c3", + "downharpoonright;": u"\u21c2", + "drbkarow;": u"\u2910", + "drcorn;": u"\u231f", + "drcrop;": u"\u230c", + "dscr;": u"\U0001d4b9", + "dscy;": u"\u0455", + "dsol;": u"\u29f6", + "dstrok;": u"\u0111", + "dtdot;": u"\u22f1", + "dtri;": u"\u25bf", + "dtrif;": u"\u25be", + "duarr;": u"\u21f5", + "duhar;": u"\u296f", + "dwangle;": u"\u29a6", + "dzcy;": u"\u045f", + "dzigrarr;": u"\u27ff", + "eDDot;": u"\u2a77", + "eDot;": u"\u2251", + "eacute": u"\xe9", + "eacute;": u"\xe9", + "easter;": u"\u2a6e", + "ecaron;": u"\u011b", + "ecir;": u"\u2256", + "ecirc": u"\xea", + "ecirc;": u"\xea", + "ecolon;": u"\u2255", + "ecy;": u"\u044d", + "edot;": u"\u0117", + "ee;": u"\u2147", + "efDot;": u"\u2252", + "efr;": u"\U0001d522", + "eg;": u"\u2a9a", + "egrave": u"\xe8", + "egrave;": u"\xe8", + "egs;": u"\u2a96", + "egsdot;": u"\u2a98", + "el;": u"\u2a99", + "elinters;": u"\u23e7", + "ell;": u"\u2113", + "els;": u"\u2a95", + "elsdot;": u"\u2a97", + "emacr;": u"\u0113", + "empty;": u"\u2205", + "emptyset;": u"\u2205", + "emptyv;": u"\u2205", + "emsp13;": u"\u2004", + "emsp14;": u"\u2005", + "emsp;": u"\u2003", + "eng;": u"\u014b", + "ensp;": u"\u2002", + "eogon;": u"\u0119", + "eopf;": u"\U0001d556", + "epar;": u"\u22d5", + "eparsl;": u"\u29e3", + "eplus;": u"\u2a71", + "epsi;": u"\u03b5", + "epsilon;": u"\u03b5", + "epsiv;": u"\u03f5", + "eqcirc;": u"\u2256", + "eqcolon;": u"\u2255", + "eqsim;": u"\u2242", + "eqslantgtr;": u"\u2a96", + "eqslantless;": u"\u2a95", + "equals;": u"=", + "equest;": u"\u225f", + "equiv;": u"\u2261", + "equivDD;": u"\u2a78", + "eqvparsl;": u"\u29e5", + "erDot;": u"\u2253", + "erarr;": u"\u2971", + "escr;": u"\u212f", + "esdot;": u"\u2250", + "esim;": u"\u2242", + "eta;": u"\u03b7", + "eth": u"\xf0", + "eth;": u"\xf0", + "euml": u"\xeb", + "euml;": u"\xeb", + "euro;": u"\u20ac", + "excl;": u"!", + "exist;": u"\u2203", + "expectation;": u"\u2130", + "exponentiale;": u"\u2147", + "fallingdotseq;": u"\u2252", + "fcy;": u"\u0444", + "female;": u"\u2640", + "ffilig;": u"\ufb03", + "fflig;": u"\ufb00", + "ffllig;": u"\ufb04", + "ffr;": u"\U0001d523", + "filig;": u"\ufb01", + "fjlig;": u"fj", + "flat;": u"\u266d", + "fllig;": u"\ufb02", + "fltns;": u"\u25b1", + "fnof;": u"\u0192", + "fopf;": u"\U0001d557", + "forall;": u"\u2200", + "fork;": u"\u22d4", + "forkv;": u"\u2ad9", + "fpartint;": u"\u2a0d", + "frac12": u"\xbd", + "frac12;": u"\xbd", + "frac13;": u"\u2153", + "frac14": u"\xbc", + "frac14;": u"\xbc", + "frac15;": u"\u2155", + "frac16;": u"\u2159", + "frac18;": u"\u215b", + "frac23;": u"\u2154", + "frac25;": u"\u2156", + "frac34": u"\xbe", + "frac34;": u"\xbe", + "frac35;": u"\u2157", + "frac38;": u"\u215c", + "frac45;": u"\u2158", + "frac56;": u"\u215a", + "frac58;": u"\u215d", + "frac78;": u"\u215e", + "frasl;": u"\u2044", + "frown;": u"\u2322", + "fscr;": u"\U0001d4bb", + "gE;": u"\u2267", + "gEl;": u"\u2a8c", + "gacute;": u"\u01f5", + "gamma;": u"\u03b3", + "gammad;": u"\u03dd", + "gap;": u"\u2a86", + "gbreve;": u"\u011f", + "gcirc;": u"\u011d", + "gcy;": u"\u0433", + "gdot;": u"\u0121", + "ge;": u"\u2265", + "gel;": u"\u22db", + "geq;": u"\u2265", + "geqq;": u"\u2267", + "geqslant;": u"\u2a7e", + "ges;": u"\u2a7e", + "gescc;": u"\u2aa9", + "gesdot;": u"\u2a80", + "gesdoto;": u"\u2a82", + "gesdotol;": u"\u2a84", + "gesl;": u"\u22db\ufe00", + "gesles;": u"\u2a94", + "gfr;": u"\U0001d524", + "gg;": u"\u226b", + "ggg;": u"\u22d9", + "gimel;": u"\u2137", + "gjcy;": u"\u0453", + "gl;": u"\u2277", + "glE;": u"\u2a92", + "gla;": u"\u2aa5", + "glj;": u"\u2aa4", + "gnE;": u"\u2269", + "gnap;": u"\u2a8a", + "gnapprox;": u"\u2a8a", + "gne;": u"\u2a88", + "gneq;": u"\u2a88", + "gneqq;": u"\u2269", + "gnsim;": u"\u22e7", + "gopf;": u"\U0001d558", + "grave;": u"`", + "gscr;": u"\u210a", + "gsim;": u"\u2273", + "gsime;": u"\u2a8e", + "gsiml;": u"\u2a90", + "gt": u">", + "gt;": u">", + "gtcc;": u"\u2aa7", + "gtcir;": u"\u2a7a", + "gtdot;": u"\u22d7", + "gtlPar;": u"\u2995", + "gtquest;": u"\u2a7c", + "gtrapprox;": u"\u2a86", + "gtrarr;": u"\u2978", + "gtrdot;": u"\u22d7", + "gtreqless;": u"\u22db", + "gtreqqless;": u"\u2a8c", + "gtrless;": u"\u2277", + "gtrsim;": u"\u2273", + "gvertneqq;": u"\u2269\ufe00", + "gvnE;": u"\u2269\ufe00", + "hArr;": u"\u21d4", + "hairsp;": u"\u200a", + "half;": u"\xbd", + "hamilt;": u"\u210b", + "hardcy;": u"\u044a", + "harr;": u"\u2194", + "harrcir;": u"\u2948", + "harrw;": u"\u21ad", + "hbar;": u"\u210f", + "hcirc;": u"\u0125", + "hearts;": u"\u2665", + "heartsuit;": u"\u2665", + "hellip;": u"\u2026", + "hercon;": u"\u22b9", + "hfr;": u"\U0001d525", + "hksearow;": u"\u2925", + "hkswarow;": u"\u2926", + "hoarr;": u"\u21ff", + "homtht;": u"\u223b", + "hookleftarrow;": u"\u21a9", + "hookrightarrow;": u"\u21aa", + "hopf;": u"\U0001d559", + "horbar;": u"\u2015", + "hscr;": u"\U0001d4bd", + "hslash;": u"\u210f", + "hstrok;": u"\u0127", + "hybull;": u"\u2043", + "hyphen;": u"\u2010", + "iacute": u"\xed", + "iacute;": u"\xed", + "ic;": u"\u2063", + "icirc": u"\xee", + "icirc;": u"\xee", + "icy;": u"\u0438", + "iecy;": u"\u0435", + "iexcl": u"\xa1", + "iexcl;": u"\xa1", + "iff;": u"\u21d4", + "ifr;": u"\U0001d526", + "igrave": u"\xec", + "igrave;": u"\xec", + "ii;": u"\u2148", + "iiiint;": u"\u2a0c", + "iiint;": u"\u222d", + "iinfin;": u"\u29dc", + "iiota;": u"\u2129", + "ijlig;": u"\u0133", + "imacr;": u"\u012b", + "image;": u"\u2111", + "imagline;": u"\u2110", + "imagpart;": u"\u2111", + "imath;": u"\u0131", + "imof;": u"\u22b7", + "imped;": u"\u01b5", + "in;": u"\u2208", + "incare;": u"\u2105", + "infin;": u"\u221e", + "infintie;": u"\u29dd", + "inodot;": u"\u0131", + "int;": u"\u222b", + "intcal;": u"\u22ba", + "integers;": u"\u2124", + "intercal;": u"\u22ba", + "intlarhk;": u"\u2a17", + "intprod;": u"\u2a3c", + "iocy;": u"\u0451", + "iogon;": u"\u012f", + "iopf;": u"\U0001d55a", + "iota;": u"\u03b9", + "iprod;": u"\u2a3c", + "iquest": u"\xbf", + "iquest;": u"\xbf", + "iscr;": u"\U0001d4be", + "isin;": u"\u2208", + "isinE;": u"\u22f9", + "isindot;": u"\u22f5", + "isins;": u"\u22f4", + "isinsv;": u"\u22f3", + "isinv;": u"\u2208", + "it;": u"\u2062", + "itilde;": u"\u0129", + "iukcy;": u"\u0456", + "iuml": u"\xef", + "iuml;": u"\xef", + "jcirc;": u"\u0135", + "jcy;": u"\u0439", + "jfr;": u"\U0001d527", + "jmath;": u"\u0237", + "jopf;": u"\U0001d55b", + "jscr;": u"\U0001d4bf", + "jsercy;": u"\u0458", + "jukcy;": u"\u0454", + "kappa;": u"\u03ba", + "kappav;": u"\u03f0", + "kcedil;": u"\u0137", + "kcy;": u"\u043a", + "kfr;": u"\U0001d528", + "kgreen;": u"\u0138", + "khcy;": u"\u0445", + "kjcy;": u"\u045c", + "kopf;": u"\U0001d55c", + "kscr;": u"\U0001d4c0", + "lAarr;": u"\u21da", + "lArr;": u"\u21d0", + "lAtail;": u"\u291b", + "lBarr;": u"\u290e", + "lE;": u"\u2266", + "lEg;": u"\u2a8b", + "lHar;": u"\u2962", + "lacute;": u"\u013a", + "laemptyv;": u"\u29b4", + "lagran;": u"\u2112", + "lambda;": u"\u03bb", + "lang;": u"\u27e8", + "langd;": u"\u2991", + "langle;": u"\u27e8", + "lap;": u"\u2a85", + "laquo": u"\xab", + "laquo;": u"\xab", + "larr;": u"\u2190", + "larrb;": u"\u21e4", + "larrbfs;": u"\u291f", + "larrfs;": u"\u291d", + "larrhk;": u"\u21a9", + "larrlp;": u"\u21ab", + "larrpl;": u"\u2939", + "larrsim;": u"\u2973", + "larrtl;": u"\u21a2", + "lat;": u"\u2aab", + "latail;": u"\u2919", + "late;": u"\u2aad", + "lates;": u"\u2aad\ufe00", + "lbarr;": u"\u290c", + "lbbrk;": u"\u2772", + "lbrace;": u"{", + "lbrack;": u"[", + "lbrke;": u"\u298b", + "lbrksld;": u"\u298f", + "lbrkslu;": u"\u298d", + "lcaron;": u"\u013e", + "lcedil;": u"\u013c", + "lceil;": u"\u2308", + "lcub;": u"{", + "lcy;": u"\u043b", + "ldca;": u"\u2936", + "ldquo;": u"\u201c", + "ldquor;": u"\u201e", + "ldrdhar;": u"\u2967", + "ldrushar;": u"\u294b", + "ldsh;": u"\u21b2", + "le;": u"\u2264", + "leftarrow;": u"\u2190", + "leftarrowtail;": u"\u21a2", + "leftharpoondown;": u"\u21bd", + "leftharpoonup;": u"\u21bc", + "leftleftarrows;": u"\u21c7", + "leftrightarrow;": u"\u2194", + "leftrightarrows;": u"\u21c6", + "leftrightharpoons;": u"\u21cb", + "leftrightsquigarrow;": u"\u21ad", + "leftthreetimes;": u"\u22cb", + "leg;": u"\u22da", + "leq;": u"\u2264", + "leqq;": u"\u2266", + "leqslant;": u"\u2a7d", + "les;": u"\u2a7d", + "lescc;": u"\u2aa8", + "lesdot;": u"\u2a7f", + "lesdoto;": u"\u2a81", + "lesdotor;": u"\u2a83", + "lesg;": u"\u22da\ufe00", + "lesges;": u"\u2a93", + "lessapprox;": u"\u2a85", + "lessdot;": u"\u22d6", + "lesseqgtr;": u"\u22da", + "lesseqqgtr;": u"\u2a8b", + "lessgtr;": u"\u2276", + "lesssim;": u"\u2272", + "lfisht;": u"\u297c", + "lfloor;": u"\u230a", + "lfr;": u"\U0001d529", + "lg;": u"\u2276", + "lgE;": u"\u2a91", + "lhard;": u"\u21bd", + "lharu;": u"\u21bc", + "lharul;": u"\u296a", + "lhblk;": u"\u2584", + "ljcy;": u"\u0459", + "ll;": u"\u226a", + "llarr;": u"\u21c7", + "llcorner;": u"\u231e", + "llhard;": u"\u296b", + "lltri;": u"\u25fa", + "lmidot;": u"\u0140", + "lmoust;": u"\u23b0", + "lmoustache;": u"\u23b0", + "lnE;": u"\u2268", + "lnap;": u"\u2a89", + "lnapprox;": u"\u2a89", + "lne;": u"\u2a87", + "lneq;": u"\u2a87", + "lneqq;": u"\u2268", + "lnsim;": u"\u22e6", + "loang;": u"\u27ec", + "loarr;": u"\u21fd", + "lobrk;": u"\u27e6", + "longleftarrow;": u"\u27f5", + "longleftrightarrow;": u"\u27f7", + "longmapsto;": u"\u27fc", + "longrightarrow;": u"\u27f6", + "looparrowleft;": u"\u21ab", + "looparrowright;": u"\u21ac", + "lopar;": u"\u2985", + "lopf;": u"\U0001d55d", + "loplus;": u"\u2a2d", + "lotimes;": u"\u2a34", + "lowast;": u"\u2217", + "lowbar;": u"_", + "loz;": u"\u25ca", + "lozenge;": u"\u25ca", + "lozf;": u"\u29eb", + "lpar;": u"(", + "lparlt;": u"\u2993", + "lrarr;": u"\u21c6", + "lrcorner;": u"\u231f", + "lrhar;": u"\u21cb", + "lrhard;": u"\u296d", + "lrm;": u"\u200e", + "lrtri;": u"\u22bf", + "lsaquo;": u"\u2039", + "lscr;": u"\U0001d4c1", + "lsh;": u"\u21b0", + "lsim;": u"\u2272", + "lsime;": u"\u2a8d", + "lsimg;": u"\u2a8f", + "lsqb;": u"[", + "lsquo;": u"\u2018", + "lsquor;": u"\u201a", + "lstrok;": u"\u0142", + "lt": u"<", + "lt;": u"<", + "ltcc;": u"\u2aa6", + "ltcir;": u"\u2a79", + "ltdot;": u"\u22d6", + "lthree;": u"\u22cb", + "ltimes;": u"\u22c9", + "ltlarr;": u"\u2976", + "ltquest;": u"\u2a7b", + "ltrPar;": u"\u2996", + "ltri;": u"\u25c3", + "ltrie;": u"\u22b4", + "ltrif;": u"\u25c2", + "lurdshar;": u"\u294a", + "luruhar;": u"\u2966", + "lvertneqq;": u"\u2268\ufe00", + "lvnE;": u"\u2268\ufe00", + "mDDot;": u"\u223a", + "macr": u"\xaf", + "macr;": u"\xaf", + "male;": u"\u2642", + "malt;": u"\u2720", + "maltese;": u"\u2720", + "map;": u"\u21a6", + "mapsto;": u"\u21a6", + "mapstodown;": u"\u21a7", + "mapstoleft;": u"\u21a4", + "mapstoup;": u"\u21a5", + "marker;": u"\u25ae", + "mcomma;": u"\u2a29", + "mcy;": u"\u043c", + "mdash;": u"\u2014", + "measuredangle;": u"\u2221", + "mfr;": u"\U0001d52a", + "mho;": u"\u2127", + "micro": u"\xb5", + "micro;": u"\xb5", + "mid;": u"\u2223", + "midast;": u"*", + "midcir;": u"\u2af0", + "middot": u"\xb7", + "middot;": u"\xb7", + "minus;": u"\u2212", + "minusb;": u"\u229f", + "minusd;": u"\u2238", + "minusdu;": u"\u2a2a", + "mlcp;": u"\u2adb", + "mldr;": u"\u2026", + "mnplus;": u"\u2213", + "models;": u"\u22a7", + "mopf;": u"\U0001d55e", + "mp;": u"\u2213", + "mscr;": u"\U0001d4c2", + "mstpos;": u"\u223e", + "mu;": u"\u03bc", + "multimap;": u"\u22b8", + "mumap;": u"\u22b8", + "nGg;": u"\u22d9\u0338", + "nGt;": u"\u226b\u20d2", + "nGtv;": u"\u226b\u0338", + "nLeftarrow;": u"\u21cd", + "nLeftrightarrow;": u"\u21ce", + "nLl;": u"\u22d8\u0338", + "nLt;": u"\u226a\u20d2", + "nLtv;": u"\u226a\u0338", + "nRightarrow;": u"\u21cf", + "nVDash;": u"\u22af", + "nVdash;": u"\u22ae", + "nabla;": u"\u2207", + "nacute;": u"\u0144", + "nang;": u"\u2220\u20d2", + "nap;": u"\u2249", + "napE;": u"\u2a70\u0338", + "napid;": u"\u224b\u0338", + "napos;": u"\u0149", + "napprox;": u"\u2249", + "natur;": u"\u266e", + "natural;": u"\u266e", + "naturals;": u"\u2115", + "nbsp": u"\xa0", + "nbsp;": u"\xa0", + "nbump;": u"\u224e\u0338", + "nbumpe;": u"\u224f\u0338", + "ncap;": u"\u2a43", + "ncaron;": u"\u0148", + "ncedil;": u"\u0146", + "ncong;": u"\u2247", + "ncongdot;": u"\u2a6d\u0338", + "ncup;": u"\u2a42", + "ncy;": u"\u043d", + "ndash;": u"\u2013", + "ne;": u"\u2260", + "neArr;": u"\u21d7", + "nearhk;": u"\u2924", + "nearr;": u"\u2197", + "nearrow;": u"\u2197", + "nedot;": u"\u2250\u0338", + "nequiv;": u"\u2262", + "nesear;": u"\u2928", + "nesim;": u"\u2242\u0338", + "nexist;": u"\u2204", + "nexists;": u"\u2204", + "nfr;": u"\U0001d52b", + "ngE;": u"\u2267\u0338", + "nge;": u"\u2271", + "ngeq;": u"\u2271", + "ngeqq;": u"\u2267\u0338", + "ngeqslant;": u"\u2a7e\u0338", + "nges;": u"\u2a7e\u0338", + "ngsim;": u"\u2275", + "ngt;": u"\u226f", + "ngtr;": u"\u226f", + "nhArr;": u"\u21ce", + "nharr;": u"\u21ae", + "nhpar;": u"\u2af2", + "ni;": u"\u220b", + "nis;": u"\u22fc", + "nisd;": u"\u22fa", + "niv;": u"\u220b", + "njcy;": u"\u045a", + "nlArr;": u"\u21cd", + "nlE;": u"\u2266\u0338", + "nlarr;": u"\u219a", + "nldr;": u"\u2025", + "nle;": u"\u2270", + "nleftarrow;": u"\u219a", + "nleftrightarrow;": u"\u21ae", + "nleq;": u"\u2270", + "nleqq;": u"\u2266\u0338", + "nleqslant;": u"\u2a7d\u0338", + "nles;": u"\u2a7d\u0338", + "nless;": u"\u226e", + "nlsim;": u"\u2274", + "nlt;": u"\u226e", + "nltri;": u"\u22ea", + "nltrie;": u"\u22ec", + "nmid;": u"\u2224", + "nopf;": u"\U0001d55f", + "not": u"\xac", + "not;": u"\xac", + "notin;": u"\u2209", + "notinE;": u"\u22f9\u0338", + "notindot;": u"\u22f5\u0338", + "notinva;": u"\u2209", + "notinvb;": u"\u22f7", + "notinvc;": u"\u22f6", + "notni;": u"\u220c", + "notniva;": u"\u220c", + "notnivb;": u"\u22fe", + "notnivc;": u"\u22fd", + "npar;": u"\u2226", + "nparallel;": u"\u2226", + "nparsl;": u"\u2afd\u20e5", + "npart;": u"\u2202\u0338", + "npolint;": u"\u2a14", + "npr;": u"\u2280", + "nprcue;": u"\u22e0", + "npre;": u"\u2aaf\u0338", + "nprec;": u"\u2280", + "npreceq;": u"\u2aaf\u0338", + "nrArr;": u"\u21cf", + "nrarr;": u"\u219b", + "nrarrc;": u"\u2933\u0338", + "nrarrw;": u"\u219d\u0338", + "nrightarrow;": u"\u219b", + "nrtri;": u"\u22eb", + "nrtrie;": u"\u22ed", + "nsc;": u"\u2281", + "nsccue;": u"\u22e1", + "nsce;": u"\u2ab0\u0338", + "nscr;": u"\U0001d4c3", + "nshortmid;": u"\u2224", + "nshortparallel;": u"\u2226", + "nsim;": u"\u2241", + "nsime;": u"\u2244", + "nsimeq;": u"\u2244", + "nsmid;": u"\u2224", + "nspar;": u"\u2226", + "nsqsube;": u"\u22e2", + "nsqsupe;": u"\u22e3", + "nsub;": u"\u2284", + "nsubE;": u"\u2ac5\u0338", + "nsube;": u"\u2288", + "nsubset;": u"\u2282\u20d2", + "nsubseteq;": u"\u2288", + "nsubseteqq;": u"\u2ac5\u0338", + "nsucc;": u"\u2281", + "nsucceq;": u"\u2ab0\u0338", + "nsup;": u"\u2285", + "nsupE;": u"\u2ac6\u0338", + "nsupe;": u"\u2289", + "nsupset;": u"\u2283\u20d2", + "nsupseteq;": u"\u2289", + "nsupseteqq;": u"\u2ac6\u0338", + "ntgl;": u"\u2279", + "ntilde": u"\xf1", + "ntilde;": u"\xf1", + "ntlg;": u"\u2278", + "ntriangleleft;": u"\u22ea", + "ntrianglelefteq;": u"\u22ec", + "ntriangleright;": u"\u22eb", + "ntrianglerighteq;": u"\u22ed", + "nu;": u"\u03bd", + "num;": u"#", + "numero;": u"\u2116", + "numsp;": u"\u2007", + "nvDash;": u"\u22ad", + "nvHarr;": u"\u2904", + "nvap;": u"\u224d\u20d2", + "nvdash;": u"\u22ac", + "nvge;": u"\u2265\u20d2", + "nvgt;": u">\u20d2", + "nvinfin;": u"\u29de", + "nvlArr;": u"\u2902", + "nvle;": u"\u2264\u20d2", + "nvlt;": u"<\u20d2", + "nvltrie;": u"\u22b4\u20d2", + "nvrArr;": u"\u2903", + "nvrtrie;": u"\u22b5\u20d2", + "nvsim;": u"\u223c\u20d2", + "nwArr;": u"\u21d6", + "nwarhk;": u"\u2923", + "nwarr;": u"\u2196", + "nwarrow;": u"\u2196", + "nwnear;": u"\u2927", + "oS;": u"\u24c8", + "oacute": u"\xf3", + "oacute;": u"\xf3", + "oast;": u"\u229b", + "ocir;": u"\u229a", + "ocirc": u"\xf4", + "ocirc;": u"\xf4", + "ocy;": u"\u043e", + "odash;": u"\u229d", + "odblac;": u"\u0151", + "odiv;": u"\u2a38", + "odot;": u"\u2299", + "odsold;": u"\u29bc", + "oelig;": u"\u0153", + "ofcir;": u"\u29bf", + "ofr;": u"\U0001d52c", + "ogon;": u"\u02db", + "ograve": u"\xf2", + "ograve;": u"\xf2", + "ogt;": u"\u29c1", + "ohbar;": u"\u29b5", + "ohm;": u"\u03a9", + "oint;": u"\u222e", + "olarr;": u"\u21ba", + "olcir;": u"\u29be", + "olcross;": u"\u29bb", + "oline;": u"\u203e", + "olt;": u"\u29c0", + "omacr;": u"\u014d", + "omega;": u"\u03c9", + "omicron;": u"\u03bf", + "omid;": u"\u29b6", + "ominus;": u"\u2296", + "oopf;": u"\U0001d560", + "opar;": u"\u29b7", + "operp;": u"\u29b9", + "oplus;": u"\u2295", + "or;": u"\u2228", + "orarr;": u"\u21bb", + "ord;": u"\u2a5d", + "order;": u"\u2134", + "orderof;": u"\u2134", + "ordf": u"\xaa", + "ordf;": u"\xaa", + "ordm": u"\xba", + "ordm;": u"\xba", + "origof;": u"\u22b6", + "oror;": u"\u2a56", + "orslope;": u"\u2a57", + "orv;": u"\u2a5b", + "oscr;": u"\u2134", + "oslash": u"\xf8", + "oslash;": u"\xf8", + "osol;": u"\u2298", + "otilde": u"\xf5", + "otilde;": u"\xf5", + "otimes;": u"\u2297", + "otimesas;": u"\u2a36", + "ouml": u"\xf6", + "ouml;": u"\xf6", + "ovbar;": u"\u233d", + "par;": u"\u2225", + "para": u"\xb6", + "para;": u"\xb6", + "parallel;": u"\u2225", + "parsim;": u"\u2af3", + "parsl;": u"\u2afd", + "part;": u"\u2202", + "pcy;": u"\u043f", + "percnt;": u"%", + "period;": u".", + "permil;": u"\u2030", + "perp;": u"\u22a5", + "pertenk;": u"\u2031", + "pfr;": u"\U0001d52d", + "phi;": u"\u03c6", + "phiv;": u"\u03d5", + "phmmat;": u"\u2133", + "phone;": u"\u260e", + "pi;": u"\u03c0", + "pitchfork;": u"\u22d4", + "piv;": u"\u03d6", + "planck;": u"\u210f", + "planckh;": u"\u210e", + "plankv;": u"\u210f", + "plus;": u"+", + "plusacir;": u"\u2a23", + "plusb;": u"\u229e", + "pluscir;": u"\u2a22", + "plusdo;": u"\u2214", + "plusdu;": u"\u2a25", + "pluse;": u"\u2a72", + "plusmn": u"\xb1", + "plusmn;": u"\xb1", + "plussim;": u"\u2a26", + "plustwo;": u"\u2a27", + "pm;": u"\xb1", + "pointint;": u"\u2a15", + "popf;": u"\U0001d561", + "pound": u"\xa3", + "pound;": u"\xa3", + "pr;": u"\u227a", + "prE;": u"\u2ab3", + "prap;": u"\u2ab7", + "prcue;": u"\u227c", + "pre;": u"\u2aaf", + "prec;": u"\u227a", + "precapprox;": u"\u2ab7", + "preccurlyeq;": u"\u227c", + "preceq;": u"\u2aaf", + "precnapprox;": u"\u2ab9", + "precneqq;": u"\u2ab5", + "precnsim;": u"\u22e8", + "precsim;": u"\u227e", + "prime;": u"\u2032", + "primes;": u"\u2119", + "prnE;": u"\u2ab5", + "prnap;": u"\u2ab9", + "prnsim;": u"\u22e8", + "prod;": u"\u220f", + "profalar;": u"\u232e", + "profline;": u"\u2312", + "profsurf;": u"\u2313", + "prop;": u"\u221d", + "propto;": u"\u221d", + "prsim;": u"\u227e", + "prurel;": u"\u22b0", + "pscr;": u"\U0001d4c5", + "psi;": u"\u03c8", + "puncsp;": u"\u2008", + "qfr;": u"\U0001d52e", + "qint;": u"\u2a0c", + "qopf;": u"\U0001d562", + "qprime;": u"\u2057", + "qscr;": u"\U0001d4c6", + "quaternions;": u"\u210d", + "quatint;": u"\u2a16", + "quest;": u"?", + "questeq;": u"\u225f", + "quot": u"\"", + "quot;": u"\"", + "rAarr;": u"\u21db", + "rArr;": u"\u21d2", + "rAtail;": u"\u291c", + "rBarr;": u"\u290f", + "rHar;": u"\u2964", + "race;": u"\u223d\u0331", + "racute;": u"\u0155", + "radic;": u"\u221a", + "raemptyv;": u"\u29b3", + "rang;": u"\u27e9", + "rangd;": u"\u2992", + "range;": u"\u29a5", + "rangle;": u"\u27e9", + "raquo": u"\xbb", + "raquo;": u"\xbb", + "rarr;": u"\u2192", + "rarrap;": u"\u2975", + "rarrb;": u"\u21e5", + "rarrbfs;": u"\u2920", + "rarrc;": u"\u2933", + "rarrfs;": u"\u291e", + "rarrhk;": u"\u21aa", + "rarrlp;": u"\u21ac", + "rarrpl;": u"\u2945", + "rarrsim;": u"\u2974", + "rarrtl;": u"\u21a3", + "rarrw;": u"\u219d", + "ratail;": u"\u291a", + "ratio;": u"\u2236", + "rationals;": u"\u211a", + "rbarr;": u"\u290d", + "rbbrk;": u"\u2773", + "rbrace;": u"}", + "rbrack;": u"]", + "rbrke;": u"\u298c", + "rbrksld;": u"\u298e", + "rbrkslu;": u"\u2990", + "rcaron;": u"\u0159", + "rcedil;": u"\u0157", + "rceil;": u"\u2309", + "rcub;": u"}", + "rcy;": u"\u0440", + "rdca;": u"\u2937", + "rdldhar;": u"\u2969", + "rdquo;": u"\u201d", + "rdquor;": u"\u201d", + "rdsh;": u"\u21b3", + "real;": u"\u211c", + "realine;": u"\u211b", + "realpart;": u"\u211c", + "reals;": u"\u211d", + "rect;": u"\u25ad", + "reg": u"\xae", + "reg;": u"\xae", + "rfisht;": u"\u297d", + "rfloor;": u"\u230b", + "rfr;": u"\U0001d52f", + "rhard;": u"\u21c1", + "rharu;": u"\u21c0", + "rharul;": u"\u296c", + "rho;": u"\u03c1", + "rhov;": u"\u03f1", + "rightarrow;": u"\u2192", + "rightarrowtail;": u"\u21a3", + "rightharpoondown;": u"\u21c1", + "rightharpoonup;": u"\u21c0", + "rightleftarrows;": u"\u21c4", + "rightleftharpoons;": u"\u21cc", + "rightrightarrows;": u"\u21c9", + "rightsquigarrow;": u"\u219d", + "rightthreetimes;": u"\u22cc", + "ring;": u"\u02da", + "risingdotseq;": u"\u2253", + "rlarr;": u"\u21c4", + "rlhar;": u"\u21cc", + "rlm;": u"\u200f", + "rmoust;": u"\u23b1", + "rmoustache;": u"\u23b1", + "rnmid;": u"\u2aee", + "roang;": u"\u27ed", + "roarr;": u"\u21fe", + "robrk;": u"\u27e7", + "ropar;": u"\u2986", + "ropf;": u"\U0001d563", + "roplus;": u"\u2a2e", + "rotimes;": u"\u2a35", + "rpar;": u")", + "rpargt;": u"\u2994", + "rppolint;": u"\u2a12", + "rrarr;": u"\u21c9", + "rsaquo;": u"\u203a", + "rscr;": u"\U0001d4c7", + "rsh;": u"\u21b1", + "rsqb;": u"]", + "rsquo;": u"\u2019", + "rsquor;": u"\u2019", + "rthree;": u"\u22cc", + "rtimes;": u"\u22ca", + "rtri;": u"\u25b9", + "rtrie;": u"\u22b5", + "rtrif;": u"\u25b8", + "rtriltri;": u"\u29ce", + "ruluhar;": u"\u2968", + "rx;": u"\u211e", + "sacute;": u"\u015b", + "sbquo;": u"\u201a", + "sc;": u"\u227b", + "scE;": u"\u2ab4", + "scap;": u"\u2ab8", + "scaron;": u"\u0161", + "sccue;": u"\u227d", + "sce;": u"\u2ab0", + "scedil;": u"\u015f", + "scirc;": u"\u015d", + "scnE;": u"\u2ab6", + "scnap;": u"\u2aba", + "scnsim;": u"\u22e9", + "scpolint;": u"\u2a13", + "scsim;": u"\u227f", + "scy;": u"\u0441", + "sdot;": u"\u22c5", + "sdotb;": u"\u22a1", + "sdote;": u"\u2a66", + "seArr;": u"\u21d8", + "searhk;": u"\u2925", + "searr;": u"\u2198", + "searrow;": u"\u2198", + "sect": u"\xa7", + "sect;": u"\xa7", + "semi;": u";", + "seswar;": u"\u2929", + "setminus;": u"\u2216", + "setmn;": u"\u2216", + "sext;": u"\u2736", + "sfr;": u"\U0001d530", + "sfrown;": u"\u2322", + "sharp;": u"\u266f", + "shchcy;": u"\u0449", + "shcy;": u"\u0448", + "shortmid;": u"\u2223", + "shortparallel;": u"\u2225", + "shy": u"\xad", + "shy;": u"\xad", + "sigma;": u"\u03c3", + "sigmaf;": u"\u03c2", + "sigmav;": u"\u03c2", + "sim;": u"\u223c", + "simdot;": u"\u2a6a", + "sime;": u"\u2243", + "simeq;": u"\u2243", + "simg;": u"\u2a9e", + "simgE;": u"\u2aa0", + "siml;": u"\u2a9d", + "simlE;": u"\u2a9f", + "simne;": u"\u2246", + "simplus;": u"\u2a24", + "simrarr;": u"\u2972", + "slarr;": u"\u2190", + "smallsetminus;": u"\u2216", + "smashp;": u"\u2a33", + "smeparsl;": u"\u29e4", + "smid;": u"\u2223", + "smile;": u"\u2323", + "smt;": u"\u2aaa", + "smte;": u"\u2aac", + "smtes;": u"\u2aac\ufe00", + "softcy;": u"\u044c", + "sol;": u"/", + "solb;": u"\u29c4", + "solbar;": u"\u233f", + "sopf;": u"\U0001d564", + "spades;": u"\u2660", + "spadesuit;": u"\u2660", + "spar;": u"\u2225", + "sqcap;": u"\u2293", + "sqcaps;": u"\u2293\ufe00", + "sqcup;": u"\u2294", + "sqcups;": u"\u2294\ufe00", + "sqsub;": u"\u228f", + "sqsube;": u"\u2291", + "sqsubset;": u"\u228f", + "sqsubseteq;": u"\u2291", + "sqsup;": u"\u2290", + "sqsupe;": u"\u2292", + "sqsupset;": u"\u2290", + "sqsupseteq;": u"\u2292", + "squ;": u"\u25a1", + "square;": u"\u25a1", + "squarf;": u"\u25aa", + "squf;": u"\u25aa", + "srarr;": u"\u2192", + "sscr;": u"\U0001d4c8", + "ssetmn;": u"\u2216", + "ssmile;": u"\u2323", + "sstarf;": u"\u22c6", + "star;": u"\u2606", + "starf;": u"\u2605", + "straightepsilon;": u"\u03f5", + "straightphi;": u"\u03d5", + "strns;": u"\xaf", + "sub;": u"\u2282", + "subE;": u"\u2ac5", + "subdot;": u"\u2abd", + "sube;": u"\u2286", + "subedot;": u"\u2ac3", + "submult;": u"\u2ac1", + "subnE;": u"\u2acb", + "subne;": u"\u228a", + "subplus;": u"\u2abf", + "subrarr;": u"\u2979", + "subset;": u"\u2282", + "subseteq;": u"\u2286", + "subseteqq;": u"\u2ac5", + "subsetneq;": u"\u228a", + "subsetneqq;": u"\u2acb", + "subsim;": u"\u2ac7", + "subsub;": u"\u2ad5", + "subsup;": u"\u2ad3", + "succ;": u"\u227b", + "succapprox;": u"\u2ab8", + "succcurlyeq;": u"\u227d", + "succeq;": u"\u2ab0", + "succnapprox;": u"\u2aba", + "succneqq;": u"\u2ab6", + "succnsim;": u"\u22e9", + "succsim;": u"\u227f", + "sum;": u"\u2211", + "sung;": u"\u266a", + "sup1": u"\xb9", + "sup1;": u"\xb9", + "sup2": u"\xb2", + "sup2;": u"\xb2", + "sup3": u"\xb3", + "sup3;": u"\xb3", + "sup;": u"\u2283", + "supE;": u"\u2ac6", + "supdot;": u"\u2abe", + "supdsub;": u"\u2ad8", + "supe;": u"\u2287", + "supedot;": u"\u2ac4", + "suphsol;": u"\u27c9", + "suphsub;": u"\u2ad7", + "suplarr;": u"\u297b", + "supmult;": u"\u2ac2", + "supnE;": u"\u2acc", + "supne;": u"\u228b", + "supplus;": u"\u2ac0", + "supset;": u"\u2283", + "supseteq;": u"\u2287", + "supseteqq;": u"\u2ac6", + "supsetneq;": u"\u228b", + "supsetneqq;": u"\u2acc", + "supsim;": u"\u2ac8", + "supsub;": u"\u2ad4", + "supsup;": u"\u2ad6", + "swArr;": u"\u21d9", + "swarhk;": u"\u2926", + "swarr;": u"\u2199", + "swarrow;": u"\u2199", + "swnwar;": u"\u292a", + "szlig": u"\xdf", + "szlig;": u"\xdf", + "target;": u"\u2316", + "tau;": u"\u03c4", + "tbrk;": u"\u23b4", + "tcaron;": u"\u0165", + "tcedil;": u"\u0163", + "tcy;": u"\u0442", + "tdot;": u"\u20db", + "telrec;": u"\u2315", + "tfr;": u"\U0001d531", + "there4;": u"\u2234", + "therefore;": u"\u2234", + "theta;": u"\u03b8", + "thetasym;": u"\u03d1", + "thetav;": u"\u03d1", + "thickapprox;": u"\u2248", + "thicksim;": u"\u223c", + "thinsp;": u"\u2009", + "thkap;": u"\u2248", + "thksim;": u"\u223c", + "thorn": u"\xfe", + "thorn;": u"\xfe", + "tilde;": u"\u02dc", + "times": u"\xd7", + "times;": u"\xd7", + "timesb;": u"\u22a0", + "timesbar;": u"\u2a31", + "timesd;": u"\u2a30", + "tint;": u"\u222d", + "toea;": u"\u2928", + "top;": u"\u22a4", + "topbot;": u"\u2336", + "topcir;": u"\u2af1", + "topf;": u"\U0001d565", + "topfork;": u"\u2ada", + "tosa;": u"\u2929", + "tprime;": u"\u2034", + "trade;": u"\u2122", + "triangle;": u"\u25b5", + "triangledown;": u"\u25bf", + "triangleleft;": u"\u25c3", + "trianglelefteq;": u"\u22b4", + "triangleq;": u"\u225c", + "triangleright;": u"\u25b9", + "trianglerighteq;": u"\u22b5", + "tridot;": u"\u25ec", + "trie;": u"\u225c", + "triminus;": u"\u2a3a", + "triplus;": u"\u2a39", + "trisb;": u"\u29cd", + "tritime;": u"\u2a3b", + "trpezium;": u"\u23e2", + "tscr;": u"\U0001d4c9", + "tscy;": u"\u0446", + "tshcy;": u"\u045b", + "tstrok;": u"\u0167", + "twixt;": u"\u226c", + "twoheadleftarrow;": u"\u219e", + "twoheadrightarrow;": u"\u21a0", + "uArr;": u"\u21d1", + "uHar;": u"\u2963", + "uacute": u"\xfa", + "uacute;": u"\xfa", + "uarr;": u"\u2191", + "ubrcy;": u"\u045e", + "ubreve;": u"\u016d", + "ucirc": u"\xfb", + "ucirc;": u"\xfb", + "ucy;": u"\u0443", + "udarr;": u"\u21c5", + "udblac;": u"\u0171", + "udhar;": u"\u296e", + "ufisht;": u"\u297e", + "ufr;": u"\U0001d532", + "ugrave": u"\xf9", + "ugrave;": u"\xf9", + "uharl;": u"\u21bf", + "uharr;": u"\u21be", + "uhblk;": u"\u2580", + "ulcorn;": u"\u231c", + "ulcorner;": u"\u231c", + "ulcrop;": u"\u230f", + "ultri;": u"\u25f8", + "umacr;": u"\u016b", + "uml": u"\xa8", + "uml;": u"\xa8", + "uogon;": u"\u0173", + "uopf;": u"\U0001d566", + "uparrow;": u"\u2191", + "updownarrow;": u"\u2195", + "upharpoonleft;": u"\u21bf", + "upharpoonright;": u"\u21be", + "uplus;": u"\u228e", + "upsi;": u"\u03c5", + "upsih;": u"\u03d2", + "upsilon;": u"\u03c5", + "upuparrows;": u"\u21c8", + "urcorn;": u"\u231d", + "urcorner;": u"\u231d", + "urcrop;": u"\u230e", + "uring;": u"\u016f", + "urtri;": u"\u25f9", + "uscr;": u"\U0001d4ca", + "utdot;": u"\u22f0", + "utilde;": u"\u0169", + "utri;": u"\u25b5", + "utrif;": u"\u25b4", + "uuarr;": u"\u21c8", + "uuml": u"\xfc", + "uuml;": u"\xfc", + "uwangle;": u"\u29a7", + "vArr;": u"\u21d5", + "vBar;": u"\u2ae8", + "vBarv;": u"\u2ae9", + "vDash;": u"\u22a8", + "vangrt;": u"\u299c", + "varepsilon;": u"\u03f5", + "varkappa;": u"\u03f0", + "varnothing;": u"\u2205", + "varphi;": u"\u03d5", + "varpi;": u"\u03d6", + "varpropto;": u"\u221d", + "varr;": u"\u2195", + "varrho;": u"\u03f1", + "varsigma;": u"\u03c2", + "varsubsetneq;": u"\u228a\ufe00", + "varsubsetneqq;": u"\u2acb\ufe00", + "varsupsetneq;": u"\u228b\ufe00", + "varsupsetneqq;": u"\u2acc\ufe00", + "vartheta;": u"\u03d1", + "vartriangleleft;": u"\u22b2", + "vartriangleright;": u"\u22b3", + "vcy;": u"\u0432", + "vdash;": u"\u22a2", + "vee;": u"\u2228", + "veebar;": u"\u22bb", + "veeeq;": u"\u225a", + "vellip;": u"\u22ee", + "verbar;": u"|", + "vert;": u"|", + "vfr;": u"\U0001d533", + "vltri;": u"\u22b2", + "vnsub;": u"\u2282\u20d2", + "vnsup;": u"\u2283\u20d2", + "vopf;": u"\U0001d567", + "vprop;": u"\u221d", + "vrtri;": u"\u22b3", + "vscr;": u"\U0001d4cb", + "vsubnE;": u"\u2acb\ufe00", + "vsubne;": u"\u228a\ufe00", + "vsupnE;": u"\u2acc\ufe00", + "vsupne;": u"\u228b\ufe00", + "vzigzag;": u"\u299a", + "wcirc;": u"\u0175", + "wedbar;": u"\u2a5f", + "wedge;": u"\u2227", + "wedgeq;": u"\u2259", + "weierp;": u"\u2118", + "wfr;": u"\U0001d534", + "wopf;": u"\U0001d568", + "wp;": u"\u2118", + "wr;": u"\u2240", + "wreath;": u"\u2240", + "wscr;": u"\U0001d4cc", + "xcap;": u"\u22c2", + "xcirc;": u"\u25ef", + "xcup;": u"\u22c3", + "xdtri;": u"\u25bd", + "xfr;": u"\U0001d535", + "xhArr;": u"\u27fa", + "xharr;": u"\u27f7", + "xi;": u"\u03be", + "xlArr;": u"\u27f8", + "xlarr;": u"\u27f5", + "xmap;": u"\u27fc", + "xnis;": u"\u22fb", + "xodot;": u"\u2a00", + "xopf;": u"\U0001d569", + "xoplus;": u"\u2a01", + "xotime;": u"\u2a02", + "xrArr;": u"\u27f9", + "xrarr;": u"\u27f6", + "xscr;": u"\U0001d4cd", + "xsqcup;": u"\u2a06", + "xuplus;": u"\u2a04", + "xutri;": u"\u25b3", + "xvee;": u"\u22c1", + "xwedge;": u"\u22c0", + "yacute": u"\xfd", + "yacute;": u"\xfd", + "yacy;": u"\u044f", + "ycirc;": u"\u0177", + "ycy;": u"\u044b", + "yen": u"\xa5", + "yen;": u"\xa5", + "yfr;": u"\U0001d536", + "yicy;": u"\u0457", + "yopf;": u"\U0001d56a", + "yscr;": u"\U0001d4ce", + "yucy;": u"\u044e", + "yuml": u"\xff", + "yuml;": u"\xff", + "zacute;": u"\u017a", + "zcaron;": u"\u017e", + "zcy;": u"\u0437", + "zdot;": u"\u017c", + "zeetrf;": u"\u2128", + "zeta;": u"\u03b6", + "zfr;": u"\U0001d537", + "zhcy;": u"\u0436", + "zigrarr;": u"\u21dd", + "zopf;": u"\U0001d56b", + "zscr;": u"\U0001d4cf", + "zwj;": u"\u200d", + "zwnj;": u"\u200c", +} + +replacementCharacters = { + 0x0:u"\uFFFD", + 0x0d:u"\u000D", + 0x80:u"\u20AC", + 0x81:u"\u0081", + 0x81:u"\u0081", + 0x82:u"\u201A", + 0x83:u"\u0192", + 0x84:u"\u201E", + 0x85:u"\u2026", + 0x86:u"\u2020", + 0x87:u"\u2021", + 0x88:u"\u02C6", + 0x89:u"\u2030", + 0x8A:u"\u0160", + 0x8B:u"\u2039", + 0x8C:u"\u0152", + 0x8D:u"\u008D", + 0x8E:u"\u017D", + 0x8F:u"\u008F", + 0x90:u"\u0090", + 0x91:u"\u2018", + 0x92:u"\u2019", + 0x93:u"\u201C", + 0x94:u"\u201D", + 0x95:u"\u2022", + 0x96:u"\u2013", + 0x97:u"\u2014", + 0x98:u"\u02DC", + 0x99:u"\u2122", + 0x9A:u"\u0161", + 0x9B:u"\u203A", + 0x9C:u"\u0153", + 0x9D:u"\u009D", + 0x9E:u"\u017E", + 0x9F:u"\u0178", +} + +encodings = { + '437': 'cp437', + '850': 'cp850', + '852': 'cp852', + '855': 'cp855', + '857': 'cp857', + '860': 'cp860', + '861': 'cp861', + '862': 'cp862', + '863': 'cp863', + '865': 'cp865', + '866': 'cp866', + '869': 'cp869', + 'ansix341968': 'ascii', + 'ansix341986': 'ascii', + 'arabic': 'iso8859-6', + 'ascii': 'ascii', + 'asmo708': 'iso8859-6', + 'big5': 'big5', + 'big5hkscs': 'big5hkscs', + 'chinese': 'gbk', + 'cp037': 'cp037', + 'cp1026': 'cp1026', + 'cp154': 'ptcp154', + 'cp367': 'ascii', + 'cp424': 'cp424', + 'cp437': 'cp437', + 'cp500': 'cp500', + 'cp775': 'cp775', + 'cp819': 'windows-1252', + 'cp850': 'cp850', + 'cp852': 'cp852', + 'cp855': 'cp855', + 'cp857': 'cp857', + 'cp860': 'cp860', + 'cp861': 'cp861', + 'cp862': 'cp862', + 'cp863': 'cp863', + 'cp864': 'cp864', + 'cp865': 'cp865', + 'cp866': 'cp866', + 'cp869': 'cp869', + 'cp936': 'gbk', + 'cpgr': 'cp869', + 'cpis': 'cp861', + 'csascii': 'ascii', + 'csbig5': 'big5', + 'cseuckr': 'cp949', + 'cseucpkdfmtjapanese': 'euc_jp', + 'csgb2312': 'gbk', + 'cshproman8': 'hp-roman8', + 'csibm037': 'cp037', + 'csibm1026': 'cp1026', + 'csibm424': 'cp424', + 'csibm500': 'cp500', + 'csibm855': 'cp855', + 'csibm857': 'cp857', + 'csibm860': 'cp860', + 'csibm861': 'cp861', + 'csibm863': 'cp863', + 'csibm864': 'cp864', + 'csibm865': 'cp865', + 'csibm866': 'cp866', + 'csibm869': 'cp869', + 'csiso2022jp': 'iso2022_jp', + 'csiso2022jp2': 'iso2022_jp_2', + 'csiso2022kr': 'iso2022_kr', + 'csiso58gb231280': 'gbk', + 'csisolatin1': 'windows-1252', + 'csisolatin2': 'iso8859-2', + 'csisolatin3': 'iso8859-3', + 'csisolatin4': 'iso8859-4', + 'csisolatin5': 'windows-1254', + 'csisolatin6': 'iso8859-10', + 'csisolatinarabic': 'iso8859-6', + 'csisolatincyrillic': 'iso8859-5', + 'csisolatingreek': 'iso8859-7', + 'csisolatinhebrew': 'iso8859-8', + 'cskoi8r': 'koi8-r', + 'csksc56011987': 'cp949', + 'cspc775baltic': 'cp775', + 'cspc850multilingual': 'cp850', + 'cspc862latinhebrew': 'cp862', + 'cspc8codepage437': 'cp437', + 'cspcp852': 'cp852', + 'csptcp154': 'ptcp154', + 'csshiftjis': 'shift_jis', + 'csunicode11utf7': 'utf-7', + 'cyrillic': 'iso8859-5', + 'cyrillicasian': 'ptcp154', + 'ebcdiccpbe': 'cp500', + 'ebcdiccpca': 'cp037', + 'ebcdiccpch': 'cp500', + 'ebcdiccphe': 'cp424', + 'ebcdiccpnl': 'cp037', + 'ebcdiccpus': 'cp037', + 'ebcdiccpwt': 'cp037', + 'ecma114': 'iso8859-6', + 'ecma118': 'iso8859-7', + 'elot928': 'iso8859-7', + 'eucjp': 'euc_jp', + 'euckr': 'cp949', + 'extendedunixcodepackedformatforjapanese': 'euc_jp', + 'gb18030': 'gb18030', + 'gb2312': 'gbk', + 'gb231280': 'gbk', + 'gbk': 'gbk', + 'greek': 'iso8859-7', + 'greek8': 'iso8859-7', + 'hebrew': 'iso8859-8', + 'hproman8': 'hp-roman8', + 'hzgb2312': 'hz', + 'ibm037': 'cp037', + 'ibm1026': 'cp1026', + 'ibm367': 'ascii', + 'ibm424': 'cp424', + 'ibm437': 'cp437', + 'ibm500': 'cp500', + 'ibm775': 'cp775', + 'ibm819': 'windows-1252', + 'ibm850': 'cp850', + 'ibm852': 'cp852', + 'ibm855': 'cp855', + 'ibm857': 'cp857', + 'ibm860': 'cp860', + 'ibm861': 'cp861', + 'ibm862': 'cp862', + 'ibm863': 'cp863', + 'ibm864': 'cp864', + 'ibm865': 'cp865', + 'ibm866': 'cp866', + 'ibm869': 'cp869', + 'iso2022jp': 'iso2022_jp', + 'iso2022jp2': 'iso2022_jp_2', + 'iso2022kr': 'iso2022_kr', + 'iso646irv1991': 'ascii', + 'iso646us': 'ascii', + 'iso88591': 'windows-1252', + 'iso885910': 'iso8859-10', + 'iso8859101992': 'iso8859-10', + 'iso885911987': 'windows-1252', + 'iso885913': 'iso8859-13', + 'iso885914': 'iso8859-14', + 'iso8859141998': 'iso8859-14', + 'iso885915': 'iso8859-15', + 'iso885916': 'iso8859-16', + 'iso8859162001': 'iso8859-16', + 'iso88592': 'iso8859-2', + 'iso885921987': 'iso8859-2', + 'iso88593': 'iso8859-3', + 'iso885931988': 'iso8859-3', + 'iso88594': 'iso8859-4', + 'iso885941988': 'iso8859-4', + 'iso88595': 'iso8859-5', + 'iso885951988': 'iso8859-5', + 'iso88596': 'iso8859-6', + 'iso885961987': 'iso8859-6', + 'iso88597': 'iso8859-7', + 'iso885971987': 'iso8859-7', + 'iso88598': 'iso8859-8', + 'iso885981988': 'iso8859-8', + 'iso88599': 'windows-1254', + 'iso885991989': 'windows-1254', + 'isoceltic': 'iso8859-14', + 'isoir100': 'windows-1252', + 'isoir101': 'iso8859-2', + 'isoir109': 'iso8859-3', + 'isoir110': 'iso8859-4', + 'isoir126': 'iso8859-7', + 'isoir127': 'iso8859-6', + 'isoir138': 'iso8859-8', + 'isoir144': 'iso8859-5', + 'isoir148': 'windows-1254', + 'isoir149': 'cp949', + 'isoir157': 'iso8859-10', + 'isoir199': 'iso8859-14', + 'isoir226': 'iso8859-16', + 'isoir58': 'gbk', + 'isoir6': 'ascii', + 'koi8r': 'koi8-r', + 'koi8u': 'koi8-u', + 'korean': 'cp949', + 'ksc5601': 'cp949', + 'ksc56011987': 'cp949', + 'ksc56011989': 'cp949', + 'l1': 'windows-1252', + 'l10': 'iso8859-16', + 'l2': 'iso8859-2', + 'l3': 'iso8859-3', + 'l4': 'iso8859-4', + 'l5': 'windows-1254', + 'l6': 'iso8859-10', + 'l8': 'iso8859-14', + 'latin1': 'windows-1252', + 'latin10': 'iso8859-16', + 'latin2': 'iso8859-2', + 'latin3': 'iso8859-3', + 'latin4': 'iso8859-4', + 'latin5': 'windows-1254', + 'latin6': 'iso8859-10', + 'latin8': 'iso8859-14', + 'latin9': 'iso8859-15', + 'ms936': 'gbk', + 'mskanji': 'shift_jis', + 'pt154': 'ptcp154', + 'ptcp154': 'ptcp154', + 'r8': 'hp-roman8', + 'roman8': 'hp-roman8', + 'shiftjis': 'shift_jis', + 'tis620': 'cp874', + 'unicode11utf7': 'utf-7', + 'us': 'ascii', + 'usascii': 'ascii', + 'utf16': 'utf-16', + 'utf16be': 'utf-16-be', + 'utf16le': 'utf-16-le', + 'utf8': 'utf-8', + 'windows1250': 'cp1250', + 'windows1251': 'cp1251', + 'windows1252': 'cp1252', + 'windows1253': 'cp1253', + 'windows1254': 'cp1254', + 'windows1255': 'cp1255', + 'windows1256': 'cp1256', + 'windows1257': 'cp1257', + 'windows1258': 'cp1258', + 'windows936': 'gbk', + 'x-x-big5': 'big5'} + +tokenTypes = { + "Doctype":0, + "Characters":1, + "SpaceCharacters":2, + "StartTag":3, + "EndTag":4, + "EmptyTag":5, + "Comment":6, + "ParseError":7 +} + +tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"])) + + +prefixes = dict([(v,k) for k,v in namespaces.iteritems()]) +prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + +class DataLossWarning(UserWarning): + pass + +class ReparseException(Exception): + pass diff --git a/html5lib/filters/__init__.py b/html5lib/filters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/html5lib/filters/_base.py b/html5lib/filters/_base.py new file mode 100644 index 00000000..bca94ada --- /dev/null +++ b/html5lib/filters/_base.py @@ -0,0 +1,10 @@ + +class Filter(object): + def __init__(self, source): + self.source = source + + def __iter__(self): + return iter(self.source) + + def __getattr__(self, name): + return getattr(self.source, name) diff --git a/html5lib/filters/formfiller.py b/html5lib/filters/formfiller.py new file mode 100644 index 00000000..94001714 --- /dev/null +++ b/html5lib/filters/formfiller.py @@ -0,0 +1,127 @@ +# +# The goal is to finally have a form filler where you pass data for +# each form, using the algorithm for "Seeding a form with initial values" +# See http://www.whatwg.org/specs/web-forms/current-work/#seeding +# + +import _base + +from html5lib.constants import spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class SimpleFilter(_base.Filter): + def __init__(self, source, fieldStorage): + _base.Filter.__init__(self, source) + self.fieldStorage = fieldStorage + + def __iter__(self): + field_indices = {} + state = None + field_name = None + for token in _base.Filter.__iter__(self): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + name = token["name"].lower() + if name == "input": + field_name = None + field_type = None + input_value_index = -1 + input_checked_index = -1 + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == u"name": + field_name = v.strip(spaceCharacters) + elif n == u"type": + field_type = v.strip(spaceCharacters) + elif n == u"checked": + input_checked_index = i + elif n == u"value": + input_value_index = i + + value_list = self.fieldStorage.getlist(field_name) + field_index = field_indices.setdefault(field_name, 0) + if field_index < len(value_list): + value = value_list[field_index] + else: + value = "" + + if field_type in (u"checkbox", u"radio"): + if value_list: + if token["data"][input_value_index][1] == value: + if input_checked_index < 0: + token["data"].append((u"checked", u"")) + field_indices[field_name] = field_index + 1 + elif input_checked_index >= 0: + del token["data"][input_checked_index] + + elif field_type not in (u"button", u"submit", u"reset"): + if input_value_index >= 0: + token["data"][input_value_index] = (u"value", value) + else: + token["data"].append((u"value", value)) + field_indices[field_name] = field_index + 1 + + field_type = None + field_name = None + + elif name == "textarea": + field_type = "textarea" + field_name = dict((token["data"])[::-1])["name"] + + elif name == "select": + field_type = "select" + attributes = dict(token["data"][::-1]) + field_name = attributes.get("name") + is_select_multiple = "multiple" in attributes + is_selected_option_found = False + + elif field_type == "select" and field_name and name == "option": + option_selected_index = -1 + option_value = None + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == "selected": + option_selected_index = i + elif n == "value": + option_value = v.strip(spaceCharacters) + if option_value is None: + raise NotImplementedError("
      +#errors +Line: 1 Col: 9 Unexpected end tag (strong). Expected DOCTYPE. +Line: 1 Col: 9 Unexpected end tag (strong) after the (implied) root element. +Line: 1 Col: 13 Unexpected end tag (b) after the (implied) root element. +Line: 1 Col: 18 Unexpected end tag (em) after the (implied) root element. +Line: 1 Col: 22 Unexpected end tag (i) after the (implied) root element. +Line: 1 Col: 26 Unexpected end tag (u) after the (implied) root element. +Line: 1 Col: 35 Unexpected end tag (strike) after the (implied) root element. +Line: 1 Col: 39 Unexpected end tag (s) after the (implied) root element. +Line: 1 Col: 47 Unexpected end tag (blink) after the (implied) root element. +Line: 1 Col: 52 Unexpected end tag (tt) after the (implied) root element. +Line: 1 Col: 58 Unexpected end tag (pre) after the (implied) root element. +Line: 1 Col: 64 Unexpected end tag (big) after the (implied) root element. +Line: 1 Col: 72 Unexpected end tag (small) after the (implied) root element. +Line: 1 Col: 79 Unexpected end tag (font) after the (implied) root element. +Line: 1 Col: 88 Unexpected end tag (select) after the (implied) root element. +Line: 1 Col: 93 Unexpected end tag (h1) after the (implied) root element. +Line: 1 Col: 98 Unexpected end tag (h2) after the (implied) root element. +Line: 1 Col: 103 Unexpected end tag (h3) after the (implied) root element. +Line: 1 Col: 108 Unexpected end tag (h4) after the (implied) root element. +Line: 1 Col: 113 Unexpected end tag (h5) after the (implied) root element. +Line: 1 Col: 118 Unexpected end tag (h6) after the (implied) root element. +Line: 1 Col: 125 Unexpected end tag (body) after the (implied) root element. +Line: 1 Col: 130 Unexpected end tag (br). Treated as br element. +Line: 1 Col: 134 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 140 This element (img) has no end tag. +Line: 1 Col: 148 Unexpected end tag (title). Ignored. +Line: 1 Col: 155 Unexpected end tag (span). Ignored. +Line: 1 Col: 163 Unexpected end tag (style). Ignored. +Line: 1 Col: 172 Unexpected end tag (script). Ignored. +Line: 1 Col: 180 Unexpected end tag (table). Ignored. +Line: 1 Col: 185 Unexpected end tag (th). Ignored. +Line: 1 Col: 190 Unexpected end tag (td). Ignored. +Line: 1 Col: 195 Unexpected end tag (tr). Ignored. +Line: 1 Col: 203 This element (frame) has no end tag. +Line: 1 Col: 210 This element (area) has no end tag. +Line: 1 Col: 217 Unexpected end tag (link). Ignored. +Line: 1 Col: 225 This element (param) has no end tag. +Line: 1 Col: 230 This element (hr) has no end tag. +Line: 1 Col: 238 This element (input) has no end tag. +Line: 1 Col: 244 Unexpected end tag (col). Ignored. +Line: 1 Col: 251 Unexpected end tag (base). Ignored. +Line: 1 Col: 258 Unexpected end tag (meta). Ignored. +Line: 1 Col: 269 This element (basefont) has no end tag. +Line: 1 Col: 279 This element (bgsound) has no end tag. +Line: 1 Col: 287 This element (embed) has no end tag. +Line: 1 Col: 296 This element (spacer) has no end tag. +Line: 1 Col: 300 Unexpected end tag (p). Ignored. +Line: 1 Col: 305 End tag (dd) seen too early. Expected other end tag. +Line: 1 Col: 310 End tag (dt) seen too early. Expected other end tag. +Line: 1 Col: 320 Unexpected end tag (caption). Ignored. +Line: 1 Col: 331 Unexpected end tag (colgroup). Ignored. +Line: 1 Col: 339 Unexpected end tag (tbody). Ignored. +Line: 1 Col: 347 Unexpected end tag (tfoot). Ignored. +Line: 1 Col: 355 Unexpected end tag (thead). Ignored. +Line: 1 Col: 365 End tag (address) seen too early. Expected other end tag. +Line: 1 Col: 378 End tag (blockquote) seen too early. Expected other end tag. +Line: 1 Col: 387 End tag (center) seen too early. Expected other end tag. +Line: 1 Col: 393 Unexpected end tag (dir). Ignored. +Line: 1 Col: 399 End tag (div) seen too early. Expected other end tag. +Line: 1 Col: 404 End tag (dl) seen too early. Expected other end tag. +Line: 1 Col: 415 End tag (fieldset) seen too early. Expected other end tag. +Line: 1 Col: 425 End tag (listing) seen too early. Expected other end tag. +Line: 1 Col: 432 End tag (menu) seen too early. Expected other end tag. +Line: 1 Col: 437 End tag (ol) seen too early. Expected other end tag. +Line: 1 Col: 442 End tag (ul) seen too early. Expected other end tag. +Line: 1 Col: 447 End tag (li) seen too early. Expected other end tag. +Line: 1 Col: 454 End tag (nobr) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 460 This element (wbr) has no end tag. +Line: 1 Col: 476 End tag (button) seen too early. Expected other end tag. +Line: 1 Col: 486 End tag (marquee) seen too early. Expected other end tag. +Line: 1 Col: 495 End tag (object) seen too early. Expected other end tag. +Line: 1 Col: 513 Unexpected end tag (html). Ignored. +Line: 1 Col: 513 Unexpected end tag (frameset). Ignored. +Line: 1 Col: 520 Unexpected end tag (head). Ignored. +Line: 1 Col: 529 Unexpected end tag (iframe). Ignored. +Line: 1 Col: 537 This element (image) has no end tag. +Line: 1 Col: 547 This element (isindex) has no end tag. +Line: 1 Col: 557 Unexpected end tag (noembed). Ignored. +Line: 1 Col: 568 Unexpected end tag (noframes). Ignored. +Line: 1 Col: 579 Unexpected end tag (noscript). Ignored. +Line: 1 Col: 590 Unexpected end tag (optgroup). Ignored. +Line: 1 Col: 599 Unexpected end tag (option). Ignored. +Line: 1 Col: 611 Unexpected end tag (plaintext). Ignored. +Line: 1 Col: 622 Unexpected end tag (textarea). Ignored. +#document +| +| +| +|
      +|

      + +#data +

      +#errors +Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. +Line: 1 Col: 20 Unexpected end tag (strong) in table context caused voodoo mode. +Line: 1 Col: 20 End tag (strong) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 24 Unexpected end tag (b) in table context caused voodoo mode. +Line: 1 Col: 24 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 29 Unexpected end tag (em) in table context caused voodoo mode. +Line: 1 Col: 29 End tag (em) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 33 Unexpected end tag (i) in table context caused voodoo mode. +Line: 1 Col: 33 End tag (i) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 37 Unexpected end tag (u) in table context caused voodoo mode. +Line: 1 Col: 37 End tag (u) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 46 Unexpected end tag (strike) in table context caused voodoo mode. +Line: 1 Col: 46 End tag (strike) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 50 Unexpected end tag (s) in table context caused voodoo mode. +Line: 1 Col: 50 End tag (s) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 58 Unexpected end tag (blink) in table context caused voodoo mode. +Line: 1 Col: 58 Unexpected end tag (blink). Ignored. +Line: 1 Col: 63 Unexpected end tag (tt) in table context caused voodoo mode. +Line: 1 Col: 63 End tag (tt) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 69 Unexpected end tag (pre) in table context caused voodoo mode. +Line: 1 Col: 69 End tag (pre) seen too early. Expected other end tag. +Line: 1 Col: 75 Unexpected end tag (big) in table context caused voodoo mode. +Line: 1 Col: 75 End tag (big) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 83 Unexpected end tag (small) in table context caused voodoo mode. +Line: 1 Col: 83 End tag (small) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 90 Unexpected end tag (font) in table context caused voodoo mode. +Line: 1 Col: 90 End tag (font) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 99 Unexpected end tag (select) in table context caused voodoo mode. +Line: 1 Col: 99 Unexpected end tag (select). Ignored. +Line: 1 Col: 104 Unexpected end tag (h1) in table context caused voodoo mode. +Line: 1 Col: 104 End tag (h1) seen too early. Expected other end tag. +Line: 1 Col: 109 Unexpected end tag (h2) in table context caused voodoo mode. +Line: 1 Col: 109 End tag (h2) seen too early. Expected other end tag. +Line: 1 Col: 114 Unexpected end tag (h3) in table context caused voodoo mode. +Line: 1 Col: 114 End tag (h3) seen too early. Expected other end tag. +Line: 1 Col: 119 Unexpected end tag (h4) in table context caused voodoo mode. +Line: 1 Col: 119 End tag (h4) seen too early. Expected other end tag. +Line: 1 Col: 124 Unexpected end tag (h5) in table context caused voodoo mode. +Line: 1 Col: 124 End tag (h5) seen too early. Expected other end tag. +Line: 1 Col: 129 Unexpected end tag (h6) in table context caused voodoo mode. +Line: 1 Col: 129 End tag (h6) seen too early. Expected other end tag. +Line: 1 Col: 136 Unexpected end tag (body) in the table row phase. Ignored. +Line: 1 Col: 141 Unexpected end tag (br) in table context caused voodoo mode. +Line: 1 Col: 141 Unexpected end tag (br). Treated as br element. +Line: 1 Col: 145 Unexpected end tag (a) in table context caused voodoo mode. +Line: 1 Col: 145 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 151 Unexpected end tag (img) in table context caused voodoo mode. +Line: 1 Col: 151 This element (img) has no end tag. +Line: 1 Col: 159 Unexpected end tag (title) in table context caused voodoo mode. +Line: 1 Col: 159 Unexpected end tag (title). Ignored. +Line: 1 Col: 166 Unexpected end tag (span) in table context caused voodoo mode. +Line: 1 Col: 166 Unexpected end tag (span). Ignored. +Line: 1 Col: 174 Unexpected end tag (style) in table context caused voodoo mode. +Line: 1 Col: 174 Unexpected end tag (style). Ignored. +Line: 1 Col: 183 Unexpected end tag (script) in table context caused voodoo mode. +Line: 1 Col: 183 Unexpected end tag (script). Ignored. +Line: 1 Col: 196 Unexpected end tag (th). Ignored. +Line: 1 Col: 201 Unexpected end tag (td). Ignored. +Line: 1 Col: 206 Unexpected end tag (tr). Ignored. +Line: 1 Col: 214 This element (frame) has no end tag. +Line: 1 Col: 221 This element (area) has no end tag. +Line: 1 Col: 228 Unexpected end tag (link). Ignored. +Line: 1 Col: 236 This element (param) has no end tag. +Line: 1 Col: 241 This element (hr) has no end tag. +Line: 1 Col: 249 This element (input) has no end tag. +Line: 1 Col: 255 Unexpected end tag (col). Ignored. +Line: 1 Col: 262 Unexpected end tag (base). Ignored. +Line: 1 Col: 269 Unexpected end tag (meta). Ignored. +Line: 1 Col: 280 This element (basefont) has no end tag. +Line: 1 Col: 290 This element (bgsound) has no end tag. +Line: 1 Col: 298 This element (embed) has no end tag. +Line: 1 Col: 307 This element (spacer) has no end tag. +Line: 1 Col: 311 Unexpected end tag (p). Ignored. +Line: 1 Col: 316 End tag (dd) seen too early. Expected other end tag. +Line: 1 Col: 321 End tag (dt) seen too early. Expected other end tag. +Line: 1 Col: 331 Unexpected end tag (caption). Ignored. +Line: 1 Col: 342 Unexpected end tag (colgroup). Ignored. +Line: 1 Col: 350 Unexpected end tag (tbody). Ignored. +Line: 1 Col: 358 Unexpected end tag (tfoot). Ignored. +Line: 1 Col: 366 Unexpected end tag (thead). Ignored. +Line: 1 Col: 376 End tag (address) seen too early. Expected other end tag. +Line: 1 Col: 389 End tag (blockquote) seen too early. Expected other end tag. +Line: 1 Col: 398 End tag (center) seen too early. Expected other end tag. +Line: 1 Col: 404 Unexpected end tag (dir). Ignored. +Line: 1 Col: 410 End tag (div) seen too early. Expected other end tag. +Line: 1 Col: 415 End tag (dl) seen too early. Expected other end tag. +Line: 1 Col: 426 End tag (fieldset) seen too early. Expected other end tag. +Line: 1 Col: 436 End tag (listing) seen too early. Expected other end tag. +Line: 1 Col: 443 End tag (menu) seen too early. Expected other end tag. +Line: 1 Col: 448 End tag (ol) seen too early. Expected other end tag. +Line: 1 Col: 453 End tag (ul) seen too early. Expected other end tag. +Line: 1 Col: 458 End tag (li) seen too early. Expected other end tag. +Line: 1 Col: 465 End tag (nobr) violates step 1, paragraph 1 of the adoption agency algorithm. +Line: 1 Col: 471 This element (wbr) has no end tag. +Line: 1 Col: 487 End tag (button) seen too early. Expected other end tag. +Line: 1 Col: 497 End tag (marquee) seen too early. Expected other end tag. +Line: 1 Col: 506 End tag (object) seen too early. Expected other end tag. +Line: 1 Col: 524 Unexpected end tag (html). Ignored. +Line: 1 Col: 524 Unexpected end tag (frameset). Ignored. +Line: 1 Col: 531 Unexpected end tag (head). Ignored. +Line: 1 Col: 540 Unexpected end tag (iframe). Ignored. +Line: 1 Col: 548 This element (image) has no end tag. +Line: 1 Col: 558 This element (isindex) has no end tag. +Line: 1 Col: 568 Unexpected end tag (noembed). Ignored. +Line: 1 Col: 579 Unexpected end tag (noframes). Ignored. +Line: 1 Col: 590 Unexpected end tag (noscript). Ignored. +Line: 1 Col: 601 Unexpected end tag (optgroup). Ignored. +Line: 1 Col: 610 Unexpected end tag (option). Ignored. +Line: 1 Col: 622 Unexpected end tag (plaintext). Ignored. +Line: 1 Col: 633 Unexpected end tag (textarea). Ignored. +#document +| +| +| +|
      +| +| +| +|

      + +#data + +#errors +Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE. +Line: 1 Col: 10 Expected closing tag. Unexpected end of file. +#document +| +| +| diff --git a/html5lib/tests/testdata/tree-construction/tests10.dat b/html5lib/tests/testdata/tree-construction/tests10.dat new file mode 100644 index 00000000..4f8df86f --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests10.dat @@ -0,0 +1,799 @@ +#data + +#errors +#document +| +| +| +| +| + +#data +a +#errors +29: Bogus comment +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| + +#data + +#errors +35: Stray “svg” start tag. +42: Stray end tag “svg” +#document +| +| +| +| +| +#errors +43: Stray “svg” start tag. +50: Stray end tag “svg” +#document +| +| +| +| +|

      +#errors +34: Start tag “svg” seen in “table”. +41: Stray end tag “svg”. +#document +| +| +| +| +| +| + +#data +
      foo
      +#errors +34: Start tag “svg” seen in “table”. +46: Stray end tag “g”. +53: Stray end tag “svg”. +#document +| +| +| +| +| +| +| "foo" +| + +#data +
      foobar
      +#errors +34: Start tag “svg” seen in “table”. +46: Stray end tag “g”. +58: Stray end tag “g”. +65: Stray end tag “svg”. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +| + +#data +
      foobar
      +#errors +41: Start tag “svg” seen in “table”. +53: Stray end tag “g”. +65: Stray end tag “g”. +72: Stray end tag “svg”. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +| +| + +#data +
      foobar
      +#errors +45: Start tag “svg” seen in “table”. +57: Stray end tag “g”. +69: Stray end tag “g”. +76: Stray end tag “svg”. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +| +| +| + +#data +
      foobar
      +#errors +#document +| +| +| +| +| +| +| +|
      +| +| +| "foo" +| +| "bar" + +#data +
      foobar

      baz

      +#errors +#document +| +| +| +| +| +| +| +|
      +| +| +| "foo" +| +| "bar" +|

      +| "baz" + +#data +
      foobar

      baz

      +#errors +#document +| +| +| +| +| +|
      +| +| +| "foo" +| +| "bar" +|

      +| "baz" + +#data +
      foobar

      baz

      quux +#errors +70: HTML start tag “p” in a foreign namespace context. +81: “table” closed but “caption” was still open. +#document +| +| +| +| +| +|
      +| +| +| "foo" +| +| "bar" +|

      +| "baz" +|

      +| "quux" + +#data +
      foobarbaz

      quux +#errors +78: “table” closed but “caption” was still open. +78: Unclosed elements on stack. +#document +| +| +| +| +| +|
      +| +| +| "foo" +| +| "bar" +| "baz" +|

      +| "quux" + +#data +foobar

      baz

      quux +#errors +44: Start tag “svg” seen in “table”. +56: Stray end tag “g”. +68: Stray end tag “g”. +71: HTML start tag “p” in a foreign namespace context. +71: Start tag “p” seen in “table”. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +|

      +| "baz" +| +| +|

      +| "quux" + +#data +

      quux +#errors +50: Stray “svg” start tag. +54: Stray “g” start tag. +62: Stray end tag “g” +66: Stray “g” start tag. +74: Stray end tag “g” +77: Stray “p” start tag. +88: “table” end tag with “select” open. +#document +| +| +| +| +| +| +| +|
      +|

      quux +#errors +36: Start tag “select” seen in “table”. +42: Stray “svg” start tag. +46: Stray “g” start tag. +54: Stray end tag “g” +58: Stray “g” start tag. +66: Stray end tag “g” +69: Stray “p” start tag. +80: “table” end tag with “select” open. +#document +| +| +| +| +| +|

      +| "quux" + +#data +foobar

      baz +#errors +41: Stray “svg” start tag. +68: HTML start tag “p” in a foreign namespace context. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +|

      +| "baz" + +#data +foobar

      baz +#errors +34: Stray “svg” start tag. +61: HTML start tag “p” in a foreign namespace context. +#document +| +| +| +| +| +| +| "foo" +| +| "bar" +|

      +| "baz" + +#data +

      +#errors +31: Stray “svg” start tag. +35: Stray “g” start tag. +40: Stray end tag “g” +44: Stray “g” start tag. +49: Stray end tag “g” +52: Stray “p” start tag. +58: Stray “span” start tag. +58: End of file seen and there were open elements. +#document +| +| +| +| + +#data +

      +#errors +42: Stray “svg” start tag. +46: Stray “g” start tag. +51: Stray end tag “g” +55: Stray “g” start tag. +60: Stray end tag “g” +63: Stray “p” start tag. +69: Stray “span” start tag. +#document +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| xlink:href="foo" +| +| xlink href="foo" + +#data + +#errors +#document +| +| +| +| +| xlink:href="foo" +| xml:lang="en" +| +| +| xlink href="foo" +| xml lang="en" + +#data + +#errors +#document +| +| +| +| +| xlink:href="foo" +| xml:lang="en" +| +| +| xlink href="foo" +| xml lang="en" + +#data +bar +#errors +#document +| +| +| +| +| xlink:href="foo" +| xml:lang="en" +| +| +| xlink href="foo" +| xml lang="en" +| "bar" + +#data + +#errors +#document +| +| +| +| + +#data +

      a +#errors +#document +| +| +| +|
      +| +| "a" + +#data +
      a +#errors +#document +| +| +| +|
      +| +| +| "a" + +#data +
      +#errors +#document +| +| +| +|
      +| +| +| + +#data +
      a +#errors +#document +| +| +| +|
      +| +| +| +| +| "a" + +#data +

      a +#errors +#document +| +| +| +|

      +| +| +| +|

      +| "a" + +#data +
        a +#errors +40: HTML start tag “ul” in a foreign namespace context. +41: End of file in a foreign namespace context. +#document +| +| +| +| +| +| +|
        +| +|
          +| "a" + +#data +
            a +#errors +35: HTML start tag “ul” in a foreign namespace context. +36: End of file in a foreign namespace context. +#document +| +| +| +| +| +| +| +|
              +| "a" + +#data +

              +#errors +#document +| +| +| +| +|

              +| +| +|

              + +#data +

              +#errors +#document +| +| +| +| +|

              +| +| +|

              + +#data +

              +#errors +#document +| +| +| +|

              +| +| +| +|

              +|

              + +#data +
              +#errors +#document +| +| +| +| +| +|
              +| +|
              +| +| + +#data +
              +#errors +#document +| +| +| +| +| +| +| +|
              +|
              +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data +

      +#errors +#document +| +| +| +| +|
      +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| +| + +#data +
      +#errors +#document +| +| +| +| +| +| +| +|
      +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| +| +| +| +| +| +| +| +| diff --git a/html5lib/tests/testdata/tree-construction/tests11.dat b/html5lib/tests/testdata/tree-construction/tests11.dat new file mode 100644 index 00000000..638cde47 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests11.dat @@ -0,0 +1,482 @@ +#data + +#errors +#document +| +| +| +| +| +| attributeName="" +| attributeType="" +| baseFrequency="" +| baseProfile="" +| calcMode="" +| clipPathUnits="" +| contentScriptType="" +| contentStyleType="" +| diffuseConstant="" +| edgeMode="" +| externalResourcesRequired="" +| filterRes="" +| filterUnits="" +| glyphRef="" +| gradientTransform="" +| gradientUnits="" +| kernelMatrix="" +| kernelUnitLength="" +| keyPoints="" +| keySplines="" +| keyTimes="" +| lengthAdjust="" +| limitingConeAngle="" +| markerHeight="" +| markerUnits="" +| markerWidth="" +| maskContentUnits="" +| maskUnits="" +| numOctaves="" +| pathLength="" +| patternContentUnits="" +| patternTransform="" +| patternUnits="" +| pointsAtX="" +| pointsAtY="" +| pointsAtZ="" +| preserveAlpha="" +| preserveAspectRatio="" +| primitiveUnits="" +| refX="" +| refY="" +| repeatCount="" +| repeatDur="" +| requiredExtensions="" +| requiredFeatures="" +| specularConstant="" +| specularExponent="" +| spreadMethod="" +| startOffset="" +| stdDeviation="" +| stitchTiles="" +| surfaceScale="" +| systemLanguage="" +| tableValues="" +| targetX="" +| targetY="" +| textLength="" +| viewBox="" +| viewTarget="" +| xChannelSelector="" +| yChannelSelector="" +| zoomAndPan="" + +#data + +#errors +#document +| +| +| +| +| +| attributeName="" +| attributeType="" +| baseFrequency="" +| baseProfile="" +| calcMode="" +| clipPathUnits="" +| contentScriptType="" +| contentStyleType="" +| diffuseConstant="" +| edgeMode="" +| externalResourcesRequired="" +| filterRes="" +| filterUnits="" +| glyphRef="" +| gradientTransform="" +| gradientUnits="" +| kernelMatrix="" +| kernelUnitLength="" +| keyPoints="" +| keySplines="" +| keyTimes="" +| lengthAdjust="" +| limitingConeAngle="" +| markerHeight="" +| markerUnits="" +| markerWidth="" +| maskContentUnits="" +| maskUnits="" +| numOctaves="" +| pathLength="" +| patternContentUnits="" +| patternTransform="" +| patternUnits="" +| pointsAtX="" +| pointsAtY="" +| pointsAtZ="" +| preserveAlpha="" +| preserveAspectRatio="" +| primitiveUnits="" +| refX="" +| refY="" +| repeatCount="" +| repeatDur="" +| requiredExtensions="" +| requiredFeatures="" +| specularConstant="" +| specularExponent="" +| spreadMethod="" +| startOffset="" +| stdDeviation="" +| stitchTiles="" +| surfaceScale="" +| systemLanguage="" +| tableValues="" +| targetX="" +| targetY="" +| textLength="" +| viewBox="" +| viewTarget="" +| xChannelSelector="" +| yChannelSelector="" +| zoomAndPan="" + +#data + +#errors +#document +| +| +| +| +| +| attributeName="" +| attributeType="" +| baseFrequency="" +| baseProfile="" +| calcMode="" +| clipPathUnits="" +| contentScriptType="" +| contentStyleType="" +| diffuseConstant="" +| edgeMode="" +| externalResourcesRequired="" +| filterRes="" +| filterUnits="" +| glyphRef="" +| gradientTransform="" +| gradientUnits="" +| kernelMatrix="" +| kernelUnitLength="" +| keyPoints="" +| keySplines="" +| keyTimes="" +| lengthAdjust="" +| limitingConeAngle="" +| markerHeight="" +| markerUnits="" +| markerWidth="" +| maskContentUnits="" +| maskUnits="" +| numOctaves="" +| pathLength="" +| patternContentUnits="" +| patternTransform="" +| patternUnits="" +| pointsAtX="" +| pointsAtY="" +| pointsAtZ="" +| preserveAlpha="" +| preserveAspectRatio="" +| primitiveUnits="" +| refX="" +| refY="" +| repeatCount="" +| repeatDur="" +| requiredExtensions="" +| requiredFeatures="" +| specularConstant="" +| specularExponent="" +| spreadMethod="" +| startOffset="" +| stdDeviation="" +| stitchTiles="" +| surfaceScale="" +| systemLanguage="" +| tableValues="" +| targetX="" +| targetY="" +| textLength="" +| viewBox="" +| viewTarget="" +| xChannelSelector="" +| yChannelSelector="" +| zoomAndPan="" + +#data + +#errors +#document +| +| +| +| +| +| attributename="" +| attributetype="" +| basefrequency="" +| baseprofile="" +| calcmode="" +| clippathunits="" +| contentscripttype="" +| contentstyletype="" +| diffuseconstant="" +| edgemode="" +| externalresourcesrequired="" +| filterres="" +| filterunits="" +| glyphref="" +| gradienttransform="" +| gradientunits="" +| kernelmatrix="" +| kernelunitlength="" +| keypoints="" +| keysplines="" +| keytimes="" +| lengthadjust="" +| limitingconeangle="" +| markerheight="" +| markerunits="" +| markerwidth="" +| maskcontentunits="" +| maskunits="" +| numoctaves="" +| pathlength="" +| patterncontentunits="" +| patterntransform="" +| patternunits="" +| pointsatx="" +| pointsaty="" +| pointsatz="" +| preservealpha="" +| preserveaspectratio="" +| primitiveunits="" +| refx="" +| refy="" +| repeatcount="" +| repeatdur="" +| requiredextensions="" +| requiredfeatures="" +| specularconstant="" +| specularexponent="" +| spreadmethod="" +| startoffset="" +| stddeviation="" +| stitchtiles="" +| surfacescale="" +| systemlanguage="" +| tablevalues="" +| targetx="" +| targety="" +| textlength="" +| viewbox="" +| viewtarget="" +| xchannelselector="" +| ychannelselector="" +| zoomandpan="" + +#data + +#errors +#document +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| diff --git a/html5lib/tests/testdata/tree-construction/tests12.dat b/html5lib/tests/testdata/tree-construction/tests12.dat new file mode 100644 index 00000000..63107d27 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests12.dat @@ -0,0 +1,62 @@ +#data +

      foobazeggs

      spam

      quuxbar +#errors +#document +| +| +| +| +|

      +| "foo" +| +| +| +| "baz" +| +| +| +| +| "eggs" +| +| +|

      +| "spam" +| +| +| +|
      +| +| +| "quux" +| "bar" + +#data +foobazeggs

      spam
      quuxbar +#errors +#document +| +| +| +| +| "foo" +| +| +| +| "baz" +| +| +| +| +| "eggs" +| +| +|

      +| "spam" +| +| +| +|
      +| +| +| "quux" +| "bar" diff --git a/html5lib/tests/testdata/tree-construction/tests14.dat b/html5lib/tests/testdata/tree-construction/tests14.dat new file mode 100644 index 00000000..b8713f88 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests14.dat @@ -0,0 +1,74 @@ +#data + +#errors +#document +| +| +| +| +| + +#data + +#errors +#document +| +| +| +| +| +| + +#data + +#errors +15: Unexpected start tag html +#document +| +| +| abc:def="gh" +| +| +| + +#data + +#errors +15: Unexpected start tag html +#document +| +| +| xml:lang="bar" +| +| + +#data + +#errors +#document +| +| +| 123="456" +| +| + +#data + +#errors +#document +| +| +| 123="456" +| 789="012" +| +| + +#data + +#errors +#document +| +| +| +| +| 789="012" diff --git a/html5lib/tests/testdata/tree-construction/tests15.dat b/html5lib/tests/testdata/tree-construction/tests15.dat new file mode 100644 index 00000000..6ce1c0d1 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests15.dat @@ -0,0 +1,208 @@ +#data +

      X +#errors +Line: 1 Col: 31 Unexpected end tag (p). Ignored. +Line: 1 Col: 36 Expected closing tag. Unexpected end of file. +#document +| +| +| +| +|

      +| +| +| +| +| +| +| " " +|

      +| "X" + +#data +

      +

      X +#errors +Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. +Line: 1 Col: 16 Unexpected end tag (p). Ignored. +Line: 2 Col: 4 Expected closing tag. Unexpected end of file. +#document +| +| +| +|

      +| +| +| +| +| +| +| " +" +|

      +| "X" + +#data + +#errors +Line: 1 Col: 22 Unexpected end tag (html) after the (implied) root element. +#document +| +| +| +| +| " " + +#data + +#errors +Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element. +#document +| +| +| +| +| + +#data + +#errors +Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. +Line: 1 Col: 13 Unexpected end tag (html) after the (implied) root element. +#document +| +| +| +| + +#data +X +#errors +Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element. +#document +| +| +| +| +| +| "X" + +#data +<!doctype html><table> X<meta></table> +#errors +Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode. +Line: 1 Col: 30 Unexpected start tag (meta) in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| " X" +| <meta> +| <table> + +#data +<!doctype html><table> x</table> +#errors +Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| " x" +| <table> + +#data +<!doctype html><table> x </table> +#errors +Line: 1 Col: 25 Unexpected non-space characters in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| " x " +| <table> + +#data +<!doctype html><table><tr> x</table> +#errors +Line: 1 Col: 28 Unexpected non-space characters in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| " x" +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><table>X<style> <tr>x </style> </table> +#errors +Line: 1 Col: 23 Unexpected non-space characters in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "X" +| <table> +| <style> +| " <tr>x " +| " " + +#data +<!doctype html><div><table><a>foo</a> <tr><td>bar</td> </tr></table></div> +#errors +Line: 1 Col: 30 Unexpected start tag (a) in table context caused voodoo mode. +Line: 1 Col: 37 Unexpected end tag (a) in table context caused voodoo mode. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <div> +| <a> +| "foo" +| <table> +| " " +| <tbody> +| <tr> +| <td> +| "bar" +| " " + +#data +<frame></frame></frame><frameset><frame><frameset><frame></frameset><noframes></frameset><noframes> +#errors +6: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. +13: Stray start tag “frame”. +21: Stray end tag “frame”. +29: Stray end tag “frame”. +39: “frameset” start tag after “body” already open. +105: End of file seen inside an [R]CDATA element. +105: End of file seen and there were open elements. +XXX: These errors are wrong, please fix me! +#document +| <html> +| <head> +| <frameset> +| <frame> +| <frameset> +| <frame> +| <noframes> +| "</frameset><noframes>" + +#data +<!DOCTYPE html><object></html> +#errors +1: Expected closing tag. Unexpected end of file +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <object> diff --git a/html5lib/tests/testdata/tree-construction/tests16.dat b/html5lib/tests/testdata/tree-construction/tests16.dat new file mode 100644 index 00000000..c8ef66f0 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests16.dat @@ -0,0 +1,2299 @@ +#data +<!doctype html><script> +#errors +Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| <body> + +#data +<!doctype html><script>a +#errors +Line: 1 Col: 24 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "a" +| <body> + +#data +<!doctype html><script>< +#errors +Line: 1 Col: 24 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<" +| <body> + +#data +<!doctype html><script></ +#errors +Line: 1 Col: 25 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</" +| <body> + +#data +<!doctype html><script></S +#errors +Line: 1 Col: 26 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</S" +| <body> + +#data +<!doctype html><script></SC +#errors +Line: 1 Col: 27 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</SC" +| <body> + +#data +<!doctype html><script></SCR +#errors +Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</SCR" +| <body> + +#data +<!doctype html><script></SCRI +#errors +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</SCRI" +| <body> + +#data +<!doctype html><script></SCRIP +#errors +Line: 1 Col: 30 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</SCRIP" +| <body> + +#data +<!doctype html><script></SCRIPT +#errors +Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</SCRIPT" +| <body> + +#data +<!doctype html><script></SCRIPT +#errors +Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| <body> + +#data +<!doctype html><script></s +#errors +Line: 1 Col: 26 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</s" +| <body> + +#data +<!doctype html><script></sc +#errors +Line: 1 Col: 27 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</sc" +| <body> + +#data +<!doctype html><script></scr +#errors +Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</scr" +| <body> + +#data +<!doctype html><script></scri +#errors +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</scri" +| <body> + +#data +<!doctype html><script></scrip +#errors +Line: 1 Col: 30 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</scrip" +| <body> + +#data +<!doctype html><script></script +#errors +Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "</script" +| <body> + +#data +<!doctype html><script></script +#errors +Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| <body> + +#data +<!doctype html><script><! +#errors +Line: 1 Col: 25 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!" +| <body> + +#data +<!doctype html><script><!a +#errors +Line: 1 Col: 26 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!a" +| <body> + +#data +<!doctype html><script><!- +#errors +Line: 1 Col: 26 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!-" +| <body> + +#data +<!doctype html><script><!-a +#errors +Line: 1 Col: 27 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!-a" +| <body> + +#data +<!doctype html><script><!-- +#errors +Line: 1 Col: 27 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--" +| <body> + +#data +<!doctype html><script><!--a +#errors +Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--a" +| <body> + +#data +<!doctype html><script><!--< +#errors +Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<" +| <body> + +#data +<!doctype html><script><!--<a +#errors +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<a" +| <body> + +#data +<!doctype html><script><!--</ +#errors +Line: 1 Col: 27 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--</" +| <body> + +#data +<!doctype html><script><!--</script +#errors +Line: 1 Col: 35 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--</script" +| <body> + +#data +<!doctype html><script><!--</script +#errors +Line: 1 Col: 36 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--" +| <body> + +#data +<!doctype html><script><!--<s +#errors +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<s" +| <body> + +#data +<!doctype html><script><!--<script +#errors +Line: 1 Col: 34 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script" +| <body> + +#data +<!doctype html><script><!--<script +#errors +Line: 1 Col: 35 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script " +| <body> + +#data +<!doctype html><script><!--<script < +#errors +Line: 1 Col: 36 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script <" +| <body> + +#data +<!doctype html><script><!--<script <a +#errors +Line: 1 Col: 37 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script <a" +| <body> + +#data +<!doctype html><script><!--<script </ +#errors +Line: 1 Col: 37 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </" +| <body> + +#data +<!doctype html><script><!--<script </s +#errors +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </s" +| <body> + +#data +<!doctype html><script><!--<script </script +#errors +Line: 1 Col: 43 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script" +| <body> + +#data +<!doctype html><script><!--<script </scripta +#errors +Line: 1 Col: 44 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </scripta" +| <body> + +#data +<!doctype html><script><!--<script </script +#errors +Line: 1 Col: 44 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<!doctype html><script><!--<script </script> +#errors +Line: 1 Col: 44 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script>" +| <body> + +#data +<!doctype html><script><!--<script </script/ +#errors +Line: 1 Col: 44 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script/" +| <body> + +#data +<!doctype html><script><!--<script </script < +#errors +Line: 1 Col: 45 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script <" +| <body> + +#data +<!doctype html><script><!--<script </script <a +#errors +Line: 1 Col: 46 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script <a" +| <body> + +#data +<!doctype html><script><!--<script </script </ +#errors +Line: 1 Col: 46 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script </" +| <body> + +#data +<!doctype html><script><!--<script </script </script +#errors +Line: 1 Col: 52 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script </script" +| <body> + +#data +<!doctype html><script><!--<script </script </script +#errors +Line: 1 Col: 53 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<!doctype html><script><!--<script </script </script/ +#errors +Line: 1 Col: 53 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<!doctype html><script><!--<script </script </script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<!doctype html><script><!--<script - +#errors +Line: 1 Col: 36 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -" +| <body> + +#data +<!doctype html><script><!--<script -a +#errors +Line: 1 Col: 37 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -a" +| <body> + +#data +<!doctype html><script><!--<script -< +#errors +Line: 1 Col: 37 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -<" +| <body> + +#data +<!doctype html><script><!--<script -- +#errors +Line: 1 Col: 37 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --" +| <body> + +#data +<!doctype html><script><!--<script --a +#errors +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --a" +| <body> + +#data +<!doctype html><script><!--<script --< +#errors +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --<" +| <body> + +#data +<!doctype html><script><!--<script --> +#errors +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<!doctype html><script><!--<script -->< +#errors +Line: 1 Col: 39 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --><" +| <body> + +#data +<!doctype html><script><!--<script --></ +#errors +Line: 1 Col: 40 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --></" +| <body> + +#data +<!doctype html><script><!--<script --></script +#errors +Line: 1 Col: 46 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script --></script" +| <body> + +#data +<!doctype html><script><!--<script --></script +#errors +Line: 1 Col: 47 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<!doctype html><script><!--<script --></script/ +#errors +Line: 1 Col: 47 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<!doctype html><script><!--<script --></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<!doctype html><script><!--<script><\/script>--></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script><\/script>-->" +| <body> + +#data +<!doctype html><script><!--<script></scr'+'ipt>--></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></scr'+'ipt>-->" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script>--><!--</script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>--><!--" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script>-- ></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>-- >" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script>- -></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>- ->" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script>- - ></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>- - >" +| <body> + +#data +<!doctype html><script><!--<script></script><script></script>-></script> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>->" +| <body> + +#data +<!doctype html><script><!--<script>--!></script>X +#errors +Line: 1 Col: 49 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script>--!></script>X" +| <body> + +#data +<!doctype html><script><!--<scr'+'ipt></script>--></script> +#errors +Line: 1 Col: 59 Unexpected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<scr'+'ipt>" +| <body> +| "-->" + +#data +<!doctype html><script><!--<script></scr'+'ipt></script>X +#errors +Line: 1 Col: 57 Unexpected end of file. Expected end tag (script). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| "<!--<script></scr'+'ipt></script>X" +| <body> + +#data +<!doctype html><style><!--<style></style>--></style> +#errors +Line: 1 Col: 52 Unexpected end tag (style). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--<style>" +| <body> +| "-->" + +#data +<!doctype html><style><!--</style>X +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--" +| <body> +| "X" + +#data +<!doctype html><style><!--...</style>...--></style> +#errors +Line: 1 Col: 51 Unexpected end tag (style). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--..." +| <body> +| "...-->" + +#data +<!doctype html><style><!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style></style>X +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style>" +| <body> +| "X" + +#data +<!doctype html><style><!--...<style><!--...--!></style>--></style> +#errors +Line: 1 Col: 66 Unexpected end tag (style). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--...<style><!--...--!>" +| <body> +| "-->" + +#data +<!doctype html><style><!--...</style><!-- --><style>@import ...</style> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "<!--..." +| <!-- --> +| <style> +| "@import ..." +| <body> + +#data +<!doctype html><style>...<style><!--...</style><!-- --></style> +#errors +Line: 1 Col: 63 Unexpected end tag (style). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "...<style><!--..." +| <!-- --> +| <body> + +#data +<!doctype html><style>...<!--[if IE]><style>...</style>X +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <style> +| "...<!--[if IE]><style>..." +| <body> +| "X" + +#data +<!doctype html><title><!--<title>--> +#errors +Line: 1 Col: 52 Unexpected end tag (title). +#document +| +| +| +| +| "<!--<title>" +| <body> +| "-->" + +#data +<!doctype html><title></title> +#errors +#document +| +| +| +| +| "" +| + +#data +foo/title><link></head><body>X +#errors +Line: 1 Col: 52 Unexpected end of file. Expected end tag (title). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <title> +| "foo/title><link></head><body>X" +| <body> + +#data +<!doctype html><noscript><!--<noscript></noscript>--></noscript> +#errors +Line: 1 Col: 64 Unexpected end tag (noscript). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <noscript> +| "<!--<noscript>" +| <body> +| "-->" + +#data +<!doctype html><noscript><!--</noscript>X<noscript>--></noscript> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <noscript> +| "<!--" +| <body> +| "X" +| <noscript> +| "-->" + +#data +<!doctype html><noscript><iframe></noscript>X +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <noscript> +| "<iframe>" +| <body> +| "X" + +#data +<!doctype html><noframes><!--<noframes></noframes>--></noframes> +#errors +Line: 1 Col: 64 Unexpected end tag (noframes). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <noframes> +| "<!--<noframes>" +| <body> +| "-->" + +#data +<!doctype html><noframes><body><script><!--...</script></body></noframes></html> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <noframes> +| "<body><script><!--...</script></body>" +| <body> + +#data +<!doctype html><textarea><!--<textarea></textarea>--></textarea> +#errors +Line: 1 Col: 64 Unexpected end tag (textarea). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <textarea> +| "<!--<textarea>" +| "-->" + +#data +<!doctype html><textarea></textarea></textarea> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <textarea> +| "</textarea>" + +#data +<!doctype html><textarea><</textarea> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <textarea> +| "<" + +#data +<!doctype html><textarea>a<b</textarea> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <textarea> +| "a<b" + +#data +<!doctype html><iframe><!--<iframe></iframe>--></iframe> +#errors +Line: 1 Col: 56 Unexpected end tag (iframe). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <iframe> +| "<!--<iframe>" +| "-->" + +#data +<!doctype html><iframe>...<!--X->...<!--/X->...</iframe> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <iframe> +| "...<!--X->...<!--/X->..." + +#data +<!doctype html><xmp><!--<xmp></xmp>--></xmp> +#errors +Line: 1 Col: 44 Unexpected end tag (xmp). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <xmp> +| "<!--<xmp>" +| "-->" + +#data +<!doctype html><noembed><!--<noembed></noembed>--></noembed> +#errors +Line: 1 Col: 60 Unexpected end tag (noembed). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <noembed> +| "<!--<noembed>" +| "-->" + +#data +<script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 8 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| <body> + +#data +<script>a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 9 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "a" +| <body> + +#data +<script>< +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 9 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<" +| <body> + +#data +<script></ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 10 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</" +| <body> + +#data +<script></S +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</S" +| <body> + +#data +<script></SC +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</SC" +| <body> + +#data +<script></SCR +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</SCR" +| <body> + +#data +<script></SCRI +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</SCRI" +| <body> + +#data +<script></SCRIP +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 15 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</SCRIP" +| <body> + +#data +<script></SCRIPT +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 16 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</SCRIPT" +| <body> + +#data +<script></SCRIPT +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 17 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| <body> + +#data +<script></s +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</s" +| <body> + +#data +<script></sc +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</sc" +| <body> + +#data +<script></scr +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</scr" +| <body> + +#data +<script></scri +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</scri" +| <body> + +#data +<script></scrip +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 15 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</scrip" +| <body> + +#data +<script></script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 16 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</script" +| <body> + +#data +<script></script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 17 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| <body> + +#data +<script><! +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 10 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!" +| <body> + +#data +<script><!a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!a" +| <body> + +#data +<script><!- +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!-" +| <body> + +#data +<script><!-a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!-a" +| <body> + +#data +<script><!-- +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--" +| <body> + +#data +<script><!--a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--a" +| <body> + +#data +<script><!--< +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<" +| <body> + +#data +<script><!--<a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<a" +| <body> + +#data +<script><!--</ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--</" +| <body> + +#data +<script><!--</script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 20 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--</script" +| <body> + +#data +<script><!--</script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--" +| <body> + +#data +<script><!--<s +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<s" +| <body> + +#data +<script><!--<script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 19 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script" +| <body> + +#data +<script><!--<script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 20 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script " +| <body> + +#data +<script><!--<script < +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script <" +| <body> + +#data +<script><!--<script <a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script <a" +| <body> + +#data +<script><!--<script </ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </" +| <body> + +#data +<script><!--<script </s +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </s" +| <body> + +#data +<script><!--<script </script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script" +| <body> + +#data +<script><!--<script </scripta +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </scripta" +| <body> + +#data +<script><!--<script </script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<script><!--<script </script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script>" +| <body> + +#data +<script><!--<script </script/ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script/" +| <body> + +#data +<script><!--<script </script < +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 30 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script <" +| <body> + +#data +<script><!--<script </script <a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script <a" +| <body> + +#data +<script><!--<script </script </ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script </" +| <body> + +#data +<script><!--<script </script </script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script </script" +| <body> + +#data +<script><!--<script </script </script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<script><!--<script </script </script/ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<script><!--<script </script </script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script </script " +| <body> + +#data +<script><!--<script - +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script -" +| <body> + +#data +<script><!--<script -a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script -a" +| <body> + +#data +<script><!--<script -- +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script --" +| <body> + +#data +<script><!--<script --a +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script --a" +| <body> + +#data +<script><!--<script --> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<script><!--<script -->< +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 24 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script --><" +| <body> + +#data +<script><!--<script --></ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 25 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script --></" +| <body> + +#data +<script><!--<script --></script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script --></script" +| <body> + +#data +<script><!--<script --></script +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<script><!--<script --></script/ +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<script><!--<script --></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script -->" +| <body> + +#data +<script><!--<script><\/script>--></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script><\/script>-->" +| <body> + +#data +<script><!--<script></scr'+'ipt>--></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></scr'+'ipt>-->" +| <body> + +#data +<script><!--<script></script><script></script></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>" +| <body> + +#data +<script><!--<script></script><script></script>--><!--</script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>--><!--" +| <body> + +#data +<script><!--<script></script><script></script>-- ></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>-- >" +| <body> + +#data +<script><!--<script></script><script></script>- -></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>- ->" +| <body> + +#data +<script><!--<script></script><script></script>- - ></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>- - >" +| <body> + +#data +<script><!--<script></script><script></script>-></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +#document +| <html> +| <head> +| <script> +| "<!--<script></script><script></script>->" +| <body> + +#data +<script><!--<script>--!></script>X +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 34 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script>--!></script>X" +| <body> + +#data +<script><!--<scr'+'ipt></script>--></script> +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 44 Unexpected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<scr'+'ipt>" +| <body> +| "-->" + +#data +<script><!--<script></scr'+'ipt></script>X +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 42 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "<!--<script></scr'+'ipt></script>X" +| <body> + +#data +<style><!--<style></style>--></style> +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +Line: 1 Col: 37 Unexpected end tag (style). +#document +| <html> +| <head> +| <style> +| "<!--<style>" +| <body> +| "-->" + +#data +<style><!--</style>X +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| <html> +| <head> +| <style> +| "<!--" +| <body> +| "X" + +#data +<style><!--...</style>...--></style> +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +Line: 1 Col: 36 Unexpected end tag (style). +#document +| <html> +| <head> +| <style> +| "<!--..." +| <body> +| "...-->" + +#data +<style><!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style></style>X +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| <html> +| <head> +| <style> +| "<!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style>" +| <body> +| "X" + +#data +<style><!--...<style><!--...--!></style>--></style> +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +Line: 1 Col: 51 Unexpected end tag (style). +#document +| <html> +| <head> +| <style> +| "<!--...<style><!--...--!>" +| <body> +| "-->" + +#data +<style><!--...</style><!-- --><style>@import ...</style> +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| <html> +| <head> +| <style> +| "<!--..." +| <!-- --> +| <style> +| "@import ..." +| <body> + +#data +<style>...<style><!--...</style><!-- --></style> +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +Line: 1 Col: 48 Unexpected end tag (style). +#document +| <html> +| <head> +| <style> +| "...<style><!--..." +| <!-- --> +| <body> + +#data +<style>...<!--[if IE]><style>...</style>X +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| <html> +| <head> +| <style> +| "...<!--[if IE]><style>..." +| <body> +| "X" + +#data +<title><!--<title>--> +#errors +Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. +Line: 1 Col: 37 Unexpected end tag (title). +#document +| +| +| +| "<!--<title>" +| <body> +| "-->" + +#data +<title></title> +#errors +Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. +#document +| +| +| +| "" +| + +#data +foo/title><link></head><body>X +#errors +Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. +Line: 1 Col: 37 Unexpected end of file. Expected end tag (title). +#document +| <html> +| <head> +| <title> +| "foo/title><link></head><body>X" +| <body> + +#data +<noscript><!--<noscript></noscript>--></noscript> +#errors +Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE. +Line: 1 Col: 49 Unexpected end tag (noscript). +#document +| <html> +| <head> +| <noscript> +| "<!--<noscript>" +| <body> +| "-->" + +#data +<noscript><!--</noscript>X<noscript>--></noscript> +#errors +Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE. +#document +| <html> +| <head> +| <noscript> +| "<!--" +| <body> +| "X" +| <noscript> +| "-->" + +#data +<noscript><iframe></noscript>X +#errors +Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE. +#document +| <html> +| <head> +| <noscript> +| "<iframe>" +| <body> +| "X" + +#data +<noframes><!--<noframes></noframes>--></noframes> +#errors +Line: 1 Col: 10 Unexpected start tag (noframes). Expected DOCTYPE. +Line: 1 Col: 49 Unexpected end tag (noframes). +#document +| <html> +| <head> +| <noframes> +| "<!--<noframes>" +| <body> +| "-->" + +#data +<noframes><body><script><!--...</script></body></noframes></html> +#errors +Line: 1 Col: 10 Unexpected start tag (noframes). Expected DOCTYPE. +#document +| <html> +| <head> +| <noframes> +| "<body><script><!--...</script></body>" +| <body> + +#data +<textarea><!--<textarea></textarea>--></textarea> +#errors +Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. +Line: 1 Col: 49 Unexpected end tag (textarea). +#document +| <html> +| <head> +| <body> +| <textarea> +| "<!--<textarea>" +| "-->" + +#data +<textarea></textarea></textarea> +#errors +Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| <textarea> +| "</textarea>" + +#data +<iframe><!--<iframe></iframe>--></iframe> +#errors +Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE. +Line: 1 Col: 41 Unexpected end tag (iframe). +#document +| <html> +| <head> +| <body> +| <iframe> +| "<!--<iframe>" +| "-->" + +#data +<iframe>...<!--X->...<!--/X->...</iframe> +#errors +Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| <iframe> +| "...<!--X->...<!--/X->..." + +#data +<xmp><!--<xmp></xmp>--></xmp> +#errors +Line: 1 Col: 5 Unexpected start tag (xmp). Expected DOCTYPE. +Line: 1 Col: 29 Unexpected end tag (xmp). +#document +| <html> +| <head> +| <body> +| <xmp> +| "<!--<xmp>" +| "-->" + +#data +<noembed><!--<noembed></noembed>--></noembed> +#errors +Line: 1 Col: 9 Unexpected start tag (noembed). Expected DOCTYPE. +Line: 1 Col: 45 Unexpected end tag (noembed). +#document +| <html> +| <head> +| <body> +| <noembed> +| "<!--<noembed>" +| "-->" + +#data +<!doctype html><table> + +#errors +Line 2 Col 0 Unexpected end of file. Expected table content. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| " +" + +#data +<!doctype html><table><td><span><font></span><span> +#errors +Line 1 Col 26 Unexpected table cell start tag (td) in the table body phase. +Line 1 Col 45 Unexpected end tag (span). +Line 1 Col 51 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <span> +| <font> +| <font> +| <span> + +#data +<!doctype html><form><table></form><form></table></form> +#errors +35: Stray end tag “form”. +41: Start tag “form” seen in “table”. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> +| <table> +| <form> diff --git a/html5lib/tests/testdata/tree-construction/tests17.dat b/html5lib/tests/testdata/tree-construction/tests17.dat new file mode 100644 index 00000000..7b555f88 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests17.dat @@ -0,0 +1,153 @@ +#data +<!doctype html><table><tbody><select><tr> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><table><tr><select><td> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <table> +| <tbody> +| <tr> +| <td> + +#data +<!doctype html><table><tr><td><select><td> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <td> + +#data +<!doctype html><table><tr><th><select><td> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <th> +| <select> +| <td> + +#data +<!doctype html><table><caption><select><tr> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <caption> +| <select> +| <tbody> +| <tr> + +#data +<!doctype html><select><tr> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><td> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><th> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><tbody> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><thead> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><tfoot> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><select><caption> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><table><tr></table>a +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| "a" diff --git a/html5lib/tests/testdata/tree-construction/tests18.dat b/html5lib/tests/testdata/tree-construction/tests18.dat new file mode 100644 index 00000000..680e1f06 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests18.dat @@ -0,0 +1,269 @@ +#data +<!doctype html><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" + +#data +<!doctype html><table><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" +| <table> + +#data +<!doctype html><table><tbody><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" +| <table> +| <tbody> + +#data +<!doctype html><table><tbody><tr><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><table><tbody><tr><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><table><td><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <plaintext> +| "</plaintext>" + +#data +<!doctype html><table><caption><plaintext></plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <caption> +| <plaintext> +| "</plaintext>" + +#data +<!doctype html><table><tr><style></script></style>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "abc" +| <table> +| <tbody> +| <tr> +| <style> +| "</script>" + +#data +<!doctype html><table><tr><script></style></script>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "abc" +| <table> +| <tbody> +| <tr> +| <script> +| "</style>" + +#data +<!doctype html><table><caption><style></script></style>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <caption> +| <style> +| "</script>" +| "abc" + +#data +<!doctype html><table><td><style></script></style>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <style> +| "</script>" +| "abc" + +#data +<!doctype html><select><script></style></script>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <script> +| "</style>" +| "abc" + +#data +<!doctype html><table><select><script></style></script>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <script> +| "</style>" +| "abc" +| <table> + +#data +<!doctype html><table><tr><select><script></style></script>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <script> +| "</style>" +| "abc" +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><frameset></frameset><noframes>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <noframes> +| "abc" + +#data +<!doctype html><frameset></frameset><noframes>abc</noframes><!--abc--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <noframes> +| "abc" +| <!-- abc --> + +#data +<!doctype html><frameset></frameset></html><noframes>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <noframes> +| "abc" + +#data +<!doctype html><frameset></frameset></html><noframes>abc</noframes><!--abc--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <noframes> +| "abc" +| <!-- abc --> + +#data +<!doctype html><table><tr></tbody><tfoot> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <tfoot> + +#data +<!doctype html><table><td><svg></svg>abc<td> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <svg svg> +| "abc" +| <td> diff --git a/html5lib/tests/testdata/tree-construction/tests19.dat b/html5lib/tests/testdata/tree-construction/tests19.dat new file mode 100644 index 00000000..0d62f5a5 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests19.dat @@ -0,0 +1,1237 @@ +#data +<!doctype html><math><mn DefinitionUrl="foo"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <math math> +| <math mn> +| definitionURL="foo" + +#data +<!doctype html><html></p><!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <!-- foo --> +| <head> +| <body> + +#data +<!doctype html><head></head></p><!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <!-- foo --> +| <body> + +#data +<!doctype html><body><p><pre> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <pre> + +#data +<!doctype html><body><p><listing> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <listing> + +#data +<!doctype html><p><plaintext> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <plaintext> + +#data +<!doctype html><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <h1> + +#data +<!doctype html><form><isindex> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> + +#data +<!doctype html><isindex action="POST"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> +| action="POST" +| <hr> +| <label> +| "This is a searchable index. Enter search keywords: " +| <input> +| name="isindex" +| <hr> + +#data +<!doctype html><isindex prompt="this is isindex"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> +| <hr> +| <label> +| "this is isindex" +| <input> +| name="isindex" +| <hr> + +#data +<!doctype html><isindex type="hidden"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> +| <hr> +| <label> +| "This is a searchable index. Enter search keywords: " +| <input> +| name="isindex" +| type="hidden" +| <hr> + +#data +<!doctype html><isindex name="foo"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <form> +| <hr> +| <label> +| "This is a searchable index. Enter search keywords: " +| <input> +| name="isindex" +| <hr> + +#data +<!doctype html><ruby><p><rp> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <p> +| <rp> + +#data +<!doctype html><ruby><div><span><rp> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <span> +| <rp> + +#data +<!doctype html><ruby><div><p><rp> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <p> +| <rp> + +#data +<!doctype html><ruby><p><rt> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <p> +| <rt> + +#data +<!doctype html><ruby><div><span><rt> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <span> +| <rt> + +#data +<!doctype html><ruby><div><p><rt> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <ruby> +| <div> +| <p> +| <rt> + +#data +<!doctype html><math/><foo> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <math math> +| <foo> + +#data +<!doctype html><svg/><foo> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <svg svg> +| <foo> + +#data +<!doctype html><div></body><!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <div> +| <!-- foo --> + +#data +<!doctype html><h1><div><h3><span></h1>foo +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <h1> +| <div> +| <h3> +| <span> +| "foo" + +#data +<!doctype html><p></h3>foo +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| "foo" + +#data +<!doctype html><h3><li>abc</h2>foo +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <h3> +| <li> +| "abc" +| "foo" + +#data +<!doctype html><table>abc<!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "abc" +| <table> +| <!-- foo --> + +#data +<!doctype html><table> <!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| " " +| <!-- foo --> + +#data +<!doctype html><table> b <!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| " b " +| <table> +| <!-- foo --> + +#data +<!doctype html><select><option><option> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <option> +| <option> + +#data +<!doctype html><select><option></optgroup> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <option> + +#data +<!doctype html><select><option></optgroup> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <option> + +#data +<!doctype html><p><math><mi><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math mi> +| <p> +| <h1> + +#data +<!doctype html><p><math><mo><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math mo> +| <p> +| <h1> + +#data +<!doctype html><p><math><mn><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math mn> +| <p> +| <h1> + +#data +<!doctype html><p><math><ms><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math ms> +| <p> +| <h1> + +#data +<!doctype html><p><math><mtext><p><h1> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math mtext> +| <p> +| <h1> + +#data +<!doctype html><frameset></noframes> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!doctype html><html c=d><body></html><html a=b> +#errors +#document +| <!DOCTYPE html> +| <html> +| a="b" +| c="d" +| <head> +| <body> + +#data +<!doctype html><html c=d><frameset></frameset></html><html a=b> +#errors +#document +| <!DOCTYPE html> +| <html> +| a="b" +| c="d" +| <head> +| <frameset> + +#data +<!doctype html><html><frameset></frameset></html><!--foo--> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <!-- foo --> + +#data +<!doctype html><html><frameset></frameset></html> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| " " + +#data +<!doctype html><html><frameset></frameset></html>abc +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!doctype html><html><frameset></frameset></html><p> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!doctype html><html><frameset></frameset></html></p> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<html><frameset></frameset></html><!doctype html> +#errors +#document +| <html> +| <head> +| <frameset> + +#data +<!doctype html><body><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> + +#data +<!doctype html><p><frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<!doctype html><p>a<frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| "a" + +#data +<!doctype html><p> <frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<!doctype html><pre><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <pre> + +#data +<!doctype html><listing><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <listing> + +#data +<!doctype html><li><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <li> + +#data +<!doctype html><dd><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <dd> + +#data +<!doctype html><dt><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <dt> + +#data +<!doctype html><button><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <button> + +#data +<!doctype html><applet><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <applet> + +#data +<!doctype html><marquee><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <marquee> + +#data +<!doctype html><object><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <object> + +#data +<!doctype html><table><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> + +#data +<!doctype html><area><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <area> + +#data +<!doctype html><basefont><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <basefont> +| <frameset> + +#data +<!doctype html><bgsound><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <bgsound> +| <frameset> + +#data +<!doctype html><br><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <br> + +#data +<!doctype html><embed><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <embed> + +#data +<!doctype html><img><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <img> + +#data +<!doctype html><input><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <input> + +#data +<!doctype html><keygen><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <keygen> + +#data +<!doctype html><wbr><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <wbr> + +#data +<!doctype html><hr><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <hr> + +#data +<!doctype html><textarea></textarea><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <textarea> + +#data +<!doctype html><xmp></xmp><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <xmp> + +#data +<!doctype html><iframe></iframe><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <iframe> + +#data +<!doctype html><select></select><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> + +#data +<!doctype html><svg></svg><frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<!doctype html><math></math><frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<!doctype html><svg><foreignObject><div> <frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<!doctype html><svg>a</svg><frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <svg svg> +| "a" + +#data +<!doctype html><svg> </svg><frameset><frame> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> +| <frame> + +#data +<html>aaa<frameset></frameset> +#errors +#document +| <html> +| <head> +| <body> +| "aaa" + +#data +<html> a <frameset></frameset> +#errors +#document +| <html> +| <head> +| <body> +| "a " + +#data +<!doctype html><div><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!doctype html><div><body><frameset> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <div> + +#data +<!doctype html><p><math></p>a +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| "a" + +#data +<!doctype html><p><math><mn><span></p>a +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <math math> +| <math mn> +| <span> +| <p> +| "a" + +#data +<!doctype html><math></html> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <math math> + +#data +<!doctype html><meta charset="ascii"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <meta> +| charset="ascii" +| <body> + +#data +<!doctype html><meta http-equiv="content-type" content="text/html;charset=ascii"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <meta> +| content="text/html;charset=ascii" +| http-equiv="content-type" +| <body> + +#data +<!doctype html><head><!--aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa--><meta charset="utf8"> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <!-- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa --> +| <meta> +| charset="utf8" +| <body> + +#data +<!doctype html><html a=b><head></head><html c=d> +#errors +#document +| <!DOCTYPE html> +| <html> +| a="b" +| c="d" +| <head> +| <body> + +#data +<!doctype html><image/> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <img> + +#data +<!doctype html>a<i>b<table>c<b>d</i>e</b>f +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "a" +| <i> +| "bc" +| <b> +| "de" +| "f" +| <table> + +#data +<!doctype html><table><i>a<b>b<div>c<a>d</i>e</b>f +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <i> +| "a" +| <b> +| "b" +| <b> +| <div> +| <b> +| <i> +| "c" +| <a> +| "d" +| <a> +| "e" +| <a> +| "f" +| <table> + +#data +<!doctype html><i>a<b>b<div>c<a>d</i>e</b>f +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <i> +| "a" +| <b> +| "b" +| <b> +| <div> +| <b> +| <i> +| "c" +| <a> +| "d" +| <a> +| "e" +| <a> +| "f" + +#data +<!doctype html><table><i>a<b>b<div>c</i> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <i> +| "a" +| <b> +| "b" +| <b> +| <div> +| <i> +| "c" +| <table> + +#data +<!doctype html><table><i>a<b>b<div>c<a>d</i>e</b>f +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <i> +| "a" +| <b> +| "b" +| <b> +| <div> +| <b> +| <i> +| "c" +| <a> +| "d" +| <a> +| "e" +| <a> +| "f" +| <table> + +#data +<!doctype html><table><i>a<div>b<tr>c<b>d</i>e +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <i> +| "a" +| <div> +| "b" +| <i> +| "c" +| <b> +| "d" +| <b> +| "e" +| <table> +| <tbody> +| <tr> + +#data +<!doctype html><table><td><table><i>a<div>b<b>c</i>d +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <i> +| "a" +| <div> +| <i> +| "b" +| <b> +| "c" +| <b> +| "d" +| <table> + +#data +<!doctype html><body><bgsound> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <bgsound> + +#data +<!doctype html><body><basefont> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <basefont> + +#data +<!doctype html><a><b></a><basefont> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <a> +| <b> +| <basefont> + +#data +<!doctype html><a><b></a><bgsound> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <a> +| <b> +| <bgsound> + +#data +<!doctype html><figcaption><article></figcaption>a +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <figcaption> +| <article> +| "a" + +#data +<!doctype html><summary><article></summary>a +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <summary> +| <article> +| "a" + +#data +<!doctype html><p><a><plaintext>b +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <a> +| <plaintext> +| <a> +| "b" + +#data +<!DOCTYPE html><div>a<a></div>b<p>c</p>d +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <div> +| "a" +| <a> +| <a> +| "b" +| <p> +| "c" +| "d" diff --git a/html5lib/tests/testdata/tree-construction/tests2.dat b/html5lib/tests/testdata/tree-construction/tests2.dat new file mode 100644 index 00000000..60d85922 --- /dev/null +++ b/html5lib/tests/testdata/tree-construction/tests2.dat @@ -0,0 +1,763 @@ +#data +<!DOCTYPE html>Test +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "Test" + +#data +<textarea>test</div>test +#errors +Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. +Line: 1 Col: 24 Expected closing tag. Unexpected end of file. +#document +| <html> +| <head> +| <body> +| <textarea> +| "test</div>test" + +#data +<table><td> +#errors +Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase. +Line: 1 Col: 11 Expected closing tag. Unexpected end of file. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> + +#data +<table><td>test</tbody></table> +#errors +Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| "test" + +#data +<frame>test +#errors +Line: 1 Col: 7 Unexpected start tag (frame). Expected DOCTYPE. +Line: 1 Col: 7 Unexpected start tag frame. Ignored. +#document +| <html> +| <head> +| <body> +| "test" + +#data +<!DOCTYPE html><frameset>test +#errors +Line: 1 Col: 29 Unepxected characters in the frameset phase. Characters ignored. +Line: 1 Col: 29 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!DOCTYPE html><frameset><!DOCTYPE html> +#errors +Line: 1 Col: 40 Unexpected DOCTYPE. Ignored. +Line: 1 Col: 40 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <frameset> + +#data +<!DOCTYPE html><font><p><b>test</font> +#errors +Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm. +Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <font> +| <p> +| <font> +| <b> +| "test" + +#data +<!DOCTYPE html><dt><div><dd> +#errors +Line: 1 Col: 28 Missing end tag (div, dt). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <dt> +| <div> +| <dd> + +#data +<script></x +#errors +Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. +Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). +#document +| <html> +| <head> +| <script> +| "</x" +| <body> + +#data +<table><plaintext><td> +#errors +Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. +Line: 1 Col: 18 Unexpected start tag (plaintext) in table context caused voodoo mode. +Line: 1 Col: 22 Unexpected end of file. Expected table content. +#document +| <html> +| <head> +| <body> +| <plaintext> +| "<td>" +| <table> + +#data +<plaintext></plaintext> +#errors +Line: 1 Col: 11 Unexpected start tag (plaintext). Expected DOCTYPE. +Line: 1 Col: 23 Expected closing tag. Unexpected end of file. +#document +| <html> +| <head> +| <body> +| <plaintext> +| "</plaintext>" + +#data +<!DOCTYPE html><table><tr>TEST +#errors +Line: 1 Col: 30 Unexpected non-space characters in table context caused voodoo mode. +Line: 1 Col: 30 Unexpected end of file. Expected table content. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "TEST" +| <table> +| <tbody> +| <tr> + +#data +<!DOCTYPE html><body t1=1><body t2=2><body t3=3 t4=4> +#errors +Line: 1 Col: 37 Unexpected start tag (body). +Line: 1 Col: 53 Unexpected start tag (body). +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| t1="1" +| t2="2" +| t3="3" +| t4="4" + +#data +</b test +#errors +Line: 1 Col: 8 Unexpected end of file in attribute name. +Line: 1 Col: 8 End tag contains unexpected attributes. +Line: 1 Col: 8 Unexpected end tag (b). Expected DOCTYPE. +Line: 1 Col: 8 Unexpected end tag (b) after the (implied) root element. +#document +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html></b test<b &=&>X +#errors +Line: 1 Col: 32 Named entity didn't end with ';'. +Line: 1 Col: 33 End tag contains unexpected attributes. +Line: 1 Col: 33 Unexpected end tag (b) after the (implied) root element. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "X" + +#data +<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt +#errors +Line: 1 Col: 9 No space after literal string 'DOCTYPE'. +Line: 1 Col: 54 Unexpected end of file in the tag name. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <script> +| type="text/x-foobar;baz" +| "X</SCRipt" +| <body> + +#data +& +#errors +Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&" + +#data +&# +#errors +Line: 1 Col: 1 Numeric entity expected. Got end of file instead. +Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&#" + +#data +&#X +#errors +Line: 1 Col: 3 Numeric entity expected but none found. +Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&#X" + +#data +&#x +#errors +Line: 1 Col: 3 Numeric entity expected but none found. +Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&#x" + +#data +- +#errors +Line: 1 Col: 4 Numeric entity didn't end with ';'. +Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "-" + +#data +&x-test +#errors +Line: 1 Col: 1 Named entity expected. Got none. +Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&x-test" + +#data +<!doctypehtml><p><li> +#errors +Line: 1 Col: 9 No space after literal string 'DOCTYPE'. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <li> + +#data +<!doctypehtml><p><dt> +#errors +Line: 1 Col: 9 No space after literal string 'DOCTYPE'. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <dt> + +#data +<!doctypehtml><p><dd> +#errors +Line: 1 Col: 9 No space after literal string 'DOCTYPE'. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <dd> + +#data +<!doctypehtml><p><form> +#errors +Line: 1 Col: 9 No space after literal string 'DOCTYPE'. +Line: 1 Col: 23 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <form> + +#data +<!DOCTYPE html><p></P>X +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| "X" + +#data +& +#errors +Line: 1 Col: 4 Named entity didn't end with ';'. +Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&" + +#data +&AMp; +#errors +Line: 1 Col: 1 Named entity expected. Got none. +Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "&AMp;" + +#data +<!DOCTYPE html><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY> +#errors +Line: 1 Col: 110 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly> + +#data +<!DOCTYPE html>X</body>X +#errors +Line: 1 Col: 24 Unexpected non-space characters in the after body phase. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "XX" + +#data +<!DOCTYPE html><!-- X +#errors +Line: 1 Col: 21 Unexpected end of file in comment. +#document +| <!DOCTYPE html> +| <!-- X --> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html><table><caption>test TEST</caption><td>test +#errors +Line: 1 Col: 54 Unexpected table cell start tag (td) in the table body phase. +Line: 1 Col: 58 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <table> +| <caption> +| "test TEST" +| <tbody> +| <tr> +| <td> +| "test" + +#data +<!DOCTYPE html><select><option><optgroup> +#errors +Line: 1 Col: 41 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <option> +| <optgroup> + +#data +<!DOCTYPE html><select><optgroup><option></optgroup><option><select><option> +#errors +Line: 1 Col: 68 Unexpected select start tag in the select phase treated as select end tag. +Line: 1 Col: 76 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <optgroup> +| <option> +| <option> +| <option> + +#data +<!DOCTYPE html><select><optgroup><option><optgroup> +#errors +Line: 1 Col: 51 Expected closing tag. Unexpected end of file. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <optgroup> +| <option> +| <optgroup> + +#data +<!DOCTYPE html><datalist><option>foo</datalist>bar +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <datalist> +| <option> +| "foo" +| "bar" + +#data +<!DOCTYPE html><font><input><input></font> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <font> +| <input> +| <input> + +#data +<!DOCTYPE html><!-- XXX - XXX --> +#errors +#document +| <!DOCTYPE html> +| <!-- XXX - XXX --> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html><!-- XXX - XXX +#errors +Line: 1 Col: 29 Unexpected end of file in comment (-) +#document +| <!DOCTYPE html> +| <!-- XXX - XXX --> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html><!-- XXX - XXX - XXX --> +#errors +#document +| <!DOCTYPE html> +| <!-- XXX - XXX - XXX --> +| <html> +| <head> +| <body> + +#data +<isindex test=x name=x> +#errors +Line: 1 Col: 23 Unexpected start tag (isindex). Expected DOCTYPE. +Line: 1 Col: 23 Unexpected start tag isindex. Don't use it! +#document +| <html> +| <head> +| <body> +| <form> +| <hr> +| <label> +| "This is a searchable index. Enter search keywords: " +| <input> +| name="isindex" +| test="x" +| <hr> + +#data +test +test +#errors +Line: 2 Col: 4 Unexpected non-space characters. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> +| "test +test" + +#data +<!DOCTYPE html><body><title>test</body> +#errors +#document +| +| +| +| +| +| "test</body>" + +#data +<!DOCTYPE html><body><title>X +#errors +#document +| +| +| +| +| +| "X" +| <meta> +| name="z" +| <link> +| rel="foo" +| <style> +| " +x { content:"</style" } " + +#data +<!DOCTYPE html><select><optgroup></optgroup></select> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <select> +| <optgroup> + +#data + + +#errors +Line: 2 Col: 1 Unexpected End of file. Expected DOCTYPE. +#document +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html> <html> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> + +#data +<!DOCTYPE html><script> +</script> <title>x +#errors +#document +| +| +| +| +#errors +Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. +Line: 1 Col: 21 Unexpected start tag (script) that can be in head. Moved. +#document +| +| +| +#errors +Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. +Line: 1 Col: 28 Unexpected start tag (style) that can be in head. Moved. +#document +| +| +| +#errors +Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. +#document +| +| +| +| +| "x" +| x +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +Line: 1 Col: 22 Unexpected end of file. Expected end tag (style). +#document +| +| +| --> x +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| +| +| x +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| +| +| x +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| +| +| x +#errors +Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. +#document +| +| +|

      +#errors +#document +| +| +| +| +| +| ddd +#errors +#document +| +| +| +#errors +#document +| +| +| +| +|
    • +| +|