mirror of
https://github.com/rembo10/headphones.git
synced 2026-03-25 06:09:26 +00:00
Merge remote-tracking branch 'AdeHub/master' into develop
This commit is contained in:
359
bs4/__init__.py
Normal file
359
bs4/__init__.py
Normal file
@@ -0,0 +1,359 @@
|
||||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||
provides methods and Pythonic idioms that make it easy to
|
||||
navigate, search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.1.3"
|
||||
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the synthetic root tag that holds the whole document.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        # The following keyword arguments were meaningful in Beautiful
        # Soup 3 but are not honored here; warn (and discard) rather
        # than fail, for backward compatibility.
        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Warn about a renamed constructor argument; return the value
            # passed under the old name (removing it from kwargs) or None.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything left in kwargs at this point is unrecognized.
        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # No explicit builder: pick one from the registry by feature.
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
             self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            # The builder decided to stop early; keep the partial tree.
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run the markup through the tree builder and close the tree."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a clean pre-parse state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        # The root tag itself is hidden when the tree is rendered.
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        # The document root has no siblings, so this operation is undefined.
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        # The document root has no siblings, so this operation is undefined.
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top tag off the stack and return the new current tag."""
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Push a tag onto the stack, making it the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated character data into a new text node."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-whitespace text to a single '\n' or ' ',
            # unless we are inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # Outside the strained subtree, only keep text the
            # SoupStrainer's text filter accepts.
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o):
        """Add an object to the parse tree."""
        o.setup(self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        self.currentTag.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None

        # Walk down the stack looking for the most recent matching tag
        # (index 0 is the root, so it is excluded from the search).
        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].prefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occured
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        # At the top level, let the SoupStrainer reject tags outright.
        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Close the most recent open tag with the given name/prefix."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Accumulate character data until endData() is called."""
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        # Force the XML tree builder; this class exists only for
        # backward compatibility with Beautiful Soup 3.
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
    """Raised to abort parsing early; BeautifulSoup.__init__ catches it
    and keeps whatever tree has been built so far."""
    pass
|
||||
|
||||
|
||||
class FeatureNotFound(ValueError):
    """Raised when no registered tree builder provides the features
    requested in the BeautifulSoup constructor."""
    pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse whatever arrives on stdin and pretty-print the result.
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
|
||||
316
bs4/builder/__init__.py
Normal file
316
bs4/builder/__init__.py
Normal file
@@ -0,0 +1,316 @@
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
whitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
    """Keeps track of available TreeBuilder classes, indexed by the
    features each one advertises."""

    def __init__(self):
        # Feature name -> builders advertising it, most recent first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recent first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the most recently registered builder that supports
        every requested feature, or None if nothing matches."""
        if not self.builders:
            # Nothing has been registered at all.
            return None
        if not features:
            # No features requested: hand back the most recently
            # registered builder.
            return self.builders[0]

        # Intersect the builder sets for each requested feature,
        # remembering the ordered list for the first feature that
        # matched anything so we can preserve registration priority.
        ordered = None
        surviving = None
        for feature in features:
            matching = self.builders_for_feature.get(feature, [])
            if not matching:
                # A feature nobody advertises does not narrow the search.
                continue
            if surviving is None:
                ordered = matching
                surviving = set(matching)
            else:
                surviving = surviving.intersection(set(matching))

        if surviving is None:
            # None of the requested features matched any builder.
            return None
        # Pick the highest-priority builder that survived every filter.
        for builder in ordered:
            if builder in surviving:
                return builder
        return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Features advertised to the TreeBuilderRegistry.
    features = []

    is_xml = False
    # Tag names whose text content must never be whitespace-collapsed.
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        # Set by BeautifulSoup.__init__ before parsing begins.
        self.soup = None

    def reset(self):
        # Subclasses override this to clear per-parse state.
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        # Subclasses must implement the actual parsing.
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        # Returns a 4-tuple: (markup, original_encoding,
        # declared_encoding, contains_replacement_characters).
        # The base implementation performs no conversion.
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        # Subclasses may replace attribute values with encoding-aware
        # stand-ins; return True when a substitution was made.
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in dict(attrs):
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[cdata_list_attr] = values
        return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        # Concrete subclasses wire a SAX parser to this object.
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        # SAX hands attribute names as (namespace, localname) tuples;
        # keep only the local name.
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    # (Fixed: the original literal listed the "td" key twice; the
    # second occurrence silently overwrote the first with an identical
    # value, so removing the duplicate changes nothing at runtime.)
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding-bearing attribute values on a <meta> tag
        with stand-in objects that can be re-encoded on output.

        Returns True if the tag declared a document encoding."""
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
|
||||
|
||||
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # Only TreeBuilder subclasses are copied and registered.
        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)
|
||||
|
||||
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
|
||||
222
bs4/builder/_html5lib.py
Normal file
222
bs4/builder/_html5lib.py
Normal file
@@ -0,0 +1,222 @@
|
||||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import NamespacedAttribute
|
||||
import html5lib
|
||||
from html5lib.constants import namespaces
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        # html5lib does its own encoding detection, so no conversion
        # happens here.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        # html5lib calls this factory to obtain its tree builder;
        # keep a reference so the soup and builder stay connected.
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """Adapter presenting a BeautifulSoup tree through html5lib's
    treebuilder interface."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        # A fresh document: reset the soup and wrap it as the root node.
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # NOTE(review): BeautifulSoup is not among this module's visible
        # imports, so reaching this line would raise NameError — confirm
        # against the full file before relying on fragment parsing.
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
||||
|
||||
class AttrList(object):
    """Dict-like view of a tag's attributes, in the interface html5lib
    expects from a node's ``attributes`` object.

    Reads come from a snapshot of the element's attributes taken at
    construction time; writes go straight through to the underlying
    element (and are NOT reflected in the snapshot).
    """

    def __init__(self, element):
        self.element = element
        # Snapshot of the element's attributes for read access.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return list(self.attrs.items()).__iter__()

    def __setitem__(self, name, value):
        # Fixed: the original body began with the bare expression
        # `"set attr", name, value` — a no-op tuple (a Python 2 debug
        # `print` statement with the keyword lost); removed.
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in list(self.attrs.keys())
|
||||
|
||||
|
||||
class Element(html5lib.treebuilders._base.Node):
|
||||
def __init__(self, element, soup, namespace):
|
||||
html5lib.treebuilders._base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def appendChild(self, node):
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# Concatenate new text onto old text node
|
||||
# XXX This has O(n^2) performance, for input like
|
||||
# "a</a>a</a>a</a>..."
|
||||
old_element = self.element.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + node.element)
|
||||
old_element.replace_with(new_element)
|
||||
else:
|
||||
self.element.append(node.element)
|
||||
node.parent = self
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
|
||||
converted_attributes = []
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
new_name = NamespacedAttribute(*name)
|
||||
del attributes[name]
|
||||
attributes[new_name] = value
|
||||
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, attributes)
|
||||
for name, value in attributes.items():
|
||||
self.element[name] = value
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
# Call set_up_substitutions manually.
|
||||
#
|
||||
# The Tag constructor called this method when the Tag was created,
|
||||
# but we just set/changed the attributes, so call it again.
|
||||
self.soup.builder.set_up_substitutions(self.element)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(self.soup.new_string(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.index(refNode.element)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
# (See comments in appendChild)
|
||||
old_node = self.element.contents[index-1]
|
||||
new_str = self.soup.new_string(old_node + node.element)
|
||||
old_node.replace_with(new_str)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
node.element.extract()
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.contents:
|
||||
child = self.element.contents[0]
|
||||
child.extract()
|
||||
if isinstance(child, Tag):
|
||||
newParent.appendChild(
|
||||
Element(child, self.soup, namespaces["html"]))
|
||||
else:
|
||||
newParent.appendChild(
|
||||
TextNode(child, self.soup))
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
    """Return a (namespace, tag name) pair, defaulting the namespace
    to the HTML namespace when none was set."""
    if self.namespace is None:
        return namespaces["html"], self.name
    return self.namespace, self.name

nameTuple = property(getNameTuple)
||||
class TextNode(Element):
    """A leaf node wrapping a NavigableString.

    Text nodes carry no attributes or children and cannot be cloned.
    """

    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        # html5lib never needs to clone bare text nodes.
        raise NotImplementedError
244
bs4/builder/_htmlparser.py
Normal file
244
bs4/builder/_htmlparser.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import (
|
||||
HTMLParser,
|
||||
HTMLParseError,
|
||||
)
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
# The 'strict' constructor argument behaves sanely only from 3.2.3 on;
# a simple tuple comparison captures the original three-way test.
CONSTRUCTOR_TAKES_STRICT = (major, minor, release) >= (3, 2, 3)
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
def handle_starttag(self, name, attrs):
|
||||
# XXX namespace
|
||||
self.soup.handle_starttag(name, None, None, dict(attrs))
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed.
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
if data.endswith("?") and data.lower().startswith("xml"):
|
||||
# "An XHTML processing instruction using the trailing '?'
|
||||
# will cause the '?' to be included in data." - HTMLParser
|
||||
# docs.
|
||||
#
|
||||
# Strip the question mark so we don't end up with two
|
||||
# question marks.
|
||||
data = data[:-1]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
features = [HTML, STRICT, HTMLPARSER]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if CONSTRUCTOR_TAKES_STRICT:
|
||||
kwargs['strict'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:return: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
return markup, None, None, False
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
||||
return (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Tolerant attribute matcher from CPython 3.2.3's html.parser.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        # Backport of CPython 3.2.3's HTMLParser.parse_starttag.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        # Inside <script>/<style>, only the matching end tag is special.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
199
bs4/builder/_lxml.py
Normal file
199
bs4/builder/_lxml.py
Normal file
@@ -0,0 +1,199 @@
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import UnicodeDammit
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that drives lxml's event-based XML parser,
    forwarding events (start/end/data/comment/...) to a soup."""

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to lxml in chunks of this many bytes.
    CHUNK_SIZE = 512

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        # Stack of inverted namespace maps; None while no namespaces
        # are in play.
        self.nsmaps = None

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).

        (Fix: the docstring previously claimed a 3-tuple; the method
        has always returned four values.)
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = None

    def start(self, name, attrs, nsmap=None):
        # Fix: use None instead of a mutable {} default argument.
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)

        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and self.nsmaps is not None:
            # There are no new namespaces for this tag, but namespaces
            # are in play, so we need a separate tag stack to know
            # when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            if self.nsmaps is None:
                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        if self.nsmaps is not None and len(self.nsmaps) > 0:
            # Namespaces are in play. Find any attributes that came in
            # from lxml with namespaces attached to their names, and
            # turn then into NamespacedAttribute objects.
            new_attrs = {}
            for attr, value in attrs.items():
                namespace, attr = self._getNsTag(attr)
                if namespace is None:
                    new_attrs[attr] = value
                else:
                    nsprefix = self._prefix_for_namespace(namespace)
                    attr = NamespacedAttribute(nsprefix, attr, namespace)
                    new_attrs[attr] = value
            attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        Returns None when `namespace` is None or no mapping is found.
        """
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]

    def end(self, name):
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if self.nsmaps is not None:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
            if len(self.nsmaps) == 0:
                # Namespaces are no longer in play, so don't bother keeping
                # track of the namespace stack.
                self.nsmaps = None

    def pi(self, target, data):
        # Processing instructions are ignored.
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """lxml-based tree builder for (possibly invalid) HTML documents."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        # Return the class itself; the base __init__ instantiates it.
        return etree.HTMLParser

    def feed(self, markup):
        # HTML parsing does not need the XML builder's chunked feed;
        # hand the whole document to lxml at once.
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
803
bs4/dammit.py
Normal file
803
bs4/dammit.py
Normal file
@@ -0,0 +1,803 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This class forces XML data into a standard format (usually to UTF-8 or
|
||||
Unicode). It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It does not rewrite the XML or HTML to reflect a new
|
||||
encoding; that's the tree builder's job.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
|
||||
# Import a library to autodetect character encodings.
|
||||
# Import a library to autodetect character encodings, preferring the
# fast C implementation (cchardet), then pure-Python chardet, then a
# no-op fallback that reports no detected encoding.
chardet_type = None
try:
    # PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            return None
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
xml_encoding_re = re.compile(
|
||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
||||
html_meta_re = re.compile(
|
||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
|
||||
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        # Build the forward map (char -> entity name), reverse map
        # (entity name -> char), and a regex matching every mapped
        # character, from Python's codepoint2name table.
        char_to_entity = {}
        entity_to_char = {}
        regex_chars = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                regex_chars.append(character)
                char_to_entity[character] = name
            # But we do want to turn &quot; into the quotation mark.
            entity_to_char[name] = character
        pattern = "[%s]" % "".join(regex_chars)
        return char_to_entity, entity_to_char, re.compile(pattern)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           ")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: replace a matched character with its named
        HTML entity."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                value = value.replace('"', "&quot;")
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity defition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
||||
class UnicodeDammit:
|
||||
"""A class for detecting the encoding of a *ML document and
|
||||
converting it to a Unicode string. If the source encoding is
|
||||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||||
equivalents."""
|
||||
|
||||
# This dictionary maps commonly seen values for "charset" in HTML
|
||||
# meta tags to the corresponding Python codec names. It only covers
|
||||
# values that aren't in Python's aliases and can't be determined
|
||||
# by the heuristics in find_codec.
|
||||
CHARSET_ALIASES = {"macintosh": "mac-roman",
|
||||
"x-sjis": "shift-jis"}
|
||||
|
||||
ENCODINGS_WITH_SMART_QUOTES = [
|
||||
"windows-1252",
|
||||
"iso-8859-1",
|
||||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=None,
             smart_quotes_to=None, is_html=False):
    """Detect `markup`'s encoding and store a Unicode version of it.

    :param markup: A bytestring (or Unicode string) to convert.
    :param override_encodings: Encodings to try before any detected ones.
      (Fix: the default was a mutable list literal; None is used instead,
      with identical behavior.)
    :param smart_quotes_to: 'xml', 'html', or 'ascii' -- how to rewrite
      Windows smart quotes, if at all.
    :param is_html: Whether to look for an HTML <meta charset> as well
      as an XML declaration.
    """
    override_encodings = override_encodings or []
    self.declared_html_encoding = None
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False

    if markup == '' or isinstance(markup, unicode):
        # Nothing to detect.
        self.markup = markup
        self.unicode_markup = unicode(markup)
        self.original_encoding = None
        return

    new_markup, document_encoding, sniffed_encoding = \
        self._detectEncoding(markup, is_html)
    self.markup = new_markup

    u = None
    if new_markup != markup:
        # _detectEncoding modified the markup, then converted it to
        # Unicode and then to UTF-8. So convert it from UTF-8.
        u = self._convert_from("utf8")
        self.original_encoding = sniffed_encoding

    if not u:
        for proposed_encoding in (
            override_encodings + [document_encoding, sniffed_encoding]):
            if proposed_encoding is not None:
                u = self._convert_from(proposed_encoding)
                if u:
                    break

    # If no luck and we have auto-detection library, try that:
    if not u and not isinstance(self.markup, unicode):
        u = self._convert_from(chardet_dammit(self.markup))

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convert_from(proposed_encoding)
            if u:
                break

    # As an absolute last resort, try the encodings again with
    # character replacement.
    if not u:
        for proposed_encoding in (
            override_encodings + [
                document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
            if proposed_encoding != "ascii":
                u = self._convert_from(proposed_encoding, "replace")
            if u is not None:
                logging.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER.")
                self.contains_replacement_characters = True
                break

    # We could at this point force it to ASCII, but that would
    # destroy so much data that I think giving up is better
    self.unicode_markup = u
    if not u:
        self.original_encoding = None
def _sub_ms_char(self, match):
|
||||
"""Changes a MS smart quote character to an XML or HTML
|
||||
entity, or an ASCII character."""
|
||||
orig = match.group(1)
|
||||
if self.smart_quotes_to == 'ascii':
|
||||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||||
else:
|
||||
sub = self.MS_CHARS.get(orig)
|
||||
if type(sub) == tuple:
|
||||
if self.smart_quotes_to == 'xml':
|
||||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||||
else:
|
||||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||||
else:
|
||||
sub = sub.encode()
|
||||
return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
    """Try to decode self.markup with the `proposed` encoding.

    On success, stores and returns the Unicode markup; on failure (or
    if this (encoding, errors) pair was already tried) returns None.
    """
    proposed = self.find_codec(proposed)
    if not proposed or (proposed, errors) in self.tried_encodings:
        return None
    self.tried_encodings.append((proposed, errors))
    markup = self.markup

    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if (self.smart_quotes_to is not None
        and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
        smart_quote_bytes = re.compile(b"([\x80-\x9f])")
        markup = smart_quote_bytes.sub(self._sub_ms_char, markup)

    try:
        u = self._to_unicode(markup, proposed, errors)
        self.markup = u
        self.original_encoding = proposed
    except Exception as e:
        # Decoding failed; the caller will try another encoding.
        return None
    return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases

    A recognized byte-order mark at the start of `data` overrides the
    supplied encoding and is stripped before decoding.'''
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
           and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
             and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding = 'utf-32le'
        data = data[4:]
    return unicode(data, encoding, errors)
def _detectEncoding(self, xml_data, is_html=False):
    """Given a document, tries to detect its XML encoding.

    Returns (possibly re-encoded data, declared encoding or None,
    BOM-sniffed encoding or None). BOM-prefixed data is converted to
    UTF-8 before the declaration is inspected.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        if xml_data[:4] == b'\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == b'\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
                 and (xml_data[2:4] != b'\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
                 (xml_data[2:4] != b'\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == b'\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == b'\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == b'\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
            pass
    except:
        xml_encoding_match = None
    # Look for an explicit declaration: XML prolog, or (for HTML) a
    # <meta charset> tag.
    xml_encoding_match = xml_encoding_re.match(xml_data)
    if not xml_encoding_match and is_html:
        xml_encoding_match = html_meta_re.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].decode(
            'ascii').lower()
        if is_html:
            self.declared_html_encoding = xml_encoding
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            # The BOM is more trustworthy than a generic multi-byte
            # declaration.
            xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
    """Resolve `charset` to a codec name Python recognizes.

    Tries the known alias table, then hyphen-stripped and
    hyphen-to-underscore variants; falls back to `charset` itself.
    """
    candidate = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not candidate and charset:
        candidate = self._codec(charset.replace("-", ""))
    if not candidate and charset:
        candidate = self._codec(charset.replace("-", "_"))
    return candidate or charset
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
# Translation table built lazily on first use and cached on the class.
EBCDIC_TO_ASCII_MAP = None

def _ebcdic_to_ascii(self, s):
    """Translate an EBCDIC byte string into its ASCII equivalent."""
    c = self.__class__
    if not c.EBCDIC_TO_ASCII_MAP:
        # EBCDIC code point -> ASCII code point, for all 256 byte values.
        emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                250,251,252,253,254,255)
        import string
        c.EBCDIC_TO_ASCII_MAP = string.maketrans(
            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
    return s.translate(c.EBCDIC_TO_ASCII_MAP)
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
# Keys are single Windows-1252 bytes; values are either an
# (entity-name, hex-codepoint) pair, or a literal replacement string
# for bytes with no sensible entity (' ', '?').
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # NOTE(review): Yuml's code point is 178 (U+0178); the empty
            # hex string here looks like lost data -- verify.
            b'\x9f': ('Yuml', ''),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn accented letters
# into their bare forms, but also contains non-horrors like turning
# smart quotes into plain '"'. Keys are single bytes; values are the
# ASCII replacement string.
MS_CHARS_TO_ASCII = {
    b'\x80' : 'EUR',
    b'\x81' : ' ',
    b'\x82' : ',',
    b'\x83' : 'f',
    b'\x84' : ',,',
    b'\x85' : '...',
    b'\x86' : '+',
    b'\x87' : '++',
    b'\x88' : '^',
    b'\x89' : '%',
    b'\x8a' : 'S',
    b'\x8b' : '<',
    b'\x8c' : 'OE',
    b'\x8d' : '?',
    b'\x8e' : 'Z',
    b'\x8f' : '?',
    b'\x90' : '?',
    b'\x91' : "'",
    b'\x92' : "'",
    b'\x93' : '"',
    b'\x94' : '"',
    b'\x95' : '*',
    b'\x96' : '-',
    b'\x97' : '--',
    b'\x98' : '~',
    b'\x99' : '(TM)',
    b'\x9a' : 's',
    b'\x9b' : '>',
    b'\x9c' : 'oe',
    b'\x9d' : '?',
    b'\x9e' : 'z',
    b'\x9f' : 'Y',
    b'\xa0' : ' ',
    b'\xa1' : '!',
    b'\xa2' : 'c',
    b'\xa3' : 'GBP',
    b'\xa4' : '$', #This approximation is especially parochial--this is the
                   #generic currency symbol.
    b'\xa5' : 'YEN',
    b'\xa6' : '|',
    b'\xa7' : 'S',
    b'\xa8' : '..',
    # NOTE(review): the copyright sign mapping to '' looks like lost
    # content (perhaps '(c)') -- verify against upstream.
    b'\xa9' : '',
    b'\xaa' : '(th)',
    b'\xab' : '<<',
    b'\xac' : '!',
    b'\xad' : ' ',
    b'\xae' : '(R)',
    b'\xaf' : '-',
    b'\xb0' : 'o',
    b'\xb1' : '+-',
    b'\xb2' : '2',
    b'\xb3' : '3',
    # NOTE(review): the only tuple value in this otherwise str-valued
    # map -- consumers expecting a plain string may choke; confirm
    # intent.
    b'\xb4' : ("'", 'acute'),
    b'\xb5' : 'u',
    b'\xb6' : 'P',
    b'\xb7' : '*',
    b'\xb8' : ',',
    b'\xb9' : '1',
    b'\xba' : '(th)',
    b'\xbb' : '>>',
    b'\xbc' : '1/4',
    b'\xbd' : '1/2',
    b'\xbe' : '3/4',
    b'\xbf' : '?',
    b'\xc0' : 'A',
    b'\xc1' : 'A',
    b'\xc2' : 'A',
    b'\xc3' : 'A',
    b'\xc4' : 'A',
    b'\xc5' : 'A',
    b'\xc6' : 'AE',
    b'\xc7' : 'C',
    b'\xc8' : 'E',
    b'\xc9' : 'E',
    b'\xca' : 'E',
    b'\xcb' : 'E',
    b'\xcc' : 'I',
    b'\xcd' : 'I',
    b'\xce' : 'I',
    b'\xcf' : 'I',
    b'\xd0' : 'D',
    b'\xd1' : 'N',
    b'\xd2' : 'O',
    b'\xd3' : 'O',
    b'\xd4' : 'O',
    b'\xd5' : 'O',
    b'\xd6' : 'O',
    b'\xd7' : '*',
    b'\xd8' : 'O',
    b'\xd9' : 'U',
    b'\xda' : 'U',
    b'\xdb' : 'U',
    b'\xdc' : 'U',
    b'\xdd' : 'Y',
    b'\xde' : 'b',
    b'\xdf' : 'B',
    b'\xe0' : 'a',
    b'\xe1' : 'a',
    b'\xe2' : 'a',
    b'\xe3' : 'a',
    b'\xe4' : 'a',
    b'\xe5' : 'a',
    b'\xe6' : 'ae',
    b'\xe7' : 'c',
    b'\xe8' : 'e',
    b'\xe9' : 'e',
    b'\xea' : 'e',
    b'\xeb' : 'e',
    b'\xec' : 'i',
    b'\xed' : 'i',
    b'\xee' : 'i',
    b'\xef' : 'i',
    b'\xf0' : 'o',
    b'\xf1' : 'n',
    b'\xf2' : 'o',
    b'\xf3' : 'o',
    b'\xf4' : 'o',
    b'\xf5' : 'o',
    b'\xf6' : 'o',
    b'\xf7' : '/',
    b'\xf8' : 'o',
    b'\xf9' : 'u',
    b'\xfa' : 'u',
    b'\xfb' : 'u',
    b'\xfc' : 'u',
    b'\xfd' : 'y',
    b'\xfe' : 'b',
    b'\xff' : 'y',
    }
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents: Windows-1252 code point ->
# the UTF-8 encoding of the same character.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252 and are deliberately absent.
#
# Bug fixes in this revision:
#  * 0xe1 was b'\xa1', which is not valid UTF-8; the UTF-8 encoding
#    of U+00E1 is b'\xc3\xa1'.
#  * 0xff (U+00FF) was missing entirely, so detwingle() left stray
#    \xff bytes untranslated.
WINDOWS_1252_TO_UTF8 = {
    0x80 : b'\xe2\x82\xac', # €
    0x82 : b'\xe2\x80\x9a', # ‚
    0x83 : b'\xc6\x92',     # ƒ
    0x84 : b'\xe2\x80\x9e', # „
    0x85 : b'\xe2\x80\xa6', # …
    0x86 : b'\xe2\x80\xa0', # †
    0x87 : b'\xe2\x80\xa1', # ‡
    0x88 : b'\xcb\x86',     # ˆ
    0x89 : b'\xe2\x80\xb0', # ‰
    0x8a : b'\xc5\xa0',     # Š
    0x8b : b'\xe2\x80\xb9', # ‹
    0x8c : b'\xc5\x92',     # Œ
    0x8e : b'\xc5\xbd',     # Ž
    0x91 : b'\xe2\x80\x98', # left single quotation mark
    0x92 : b'\xe2\x80\x99', # right single quotation mark
    0x93 : b'\xe2\x80\x9c', # left double quotation mark
    0x94 : b'\xe2\x80\x9d', # right double quotation mark
    0x95 : b'\xe2\x80\xa2', # •
    0x96 : b'\xe2\x80\x93', # –
    0x97 : b'\xe2\x80\x94', # —
    0x98 : b'\xcb\x9c',     # ˜
    0x99 : b'\xe2\x84\xa2', # ™
    0x9a : b'\xc5\xa1',     # š
    0x9b : b'\xe2\x80\xba', # ›
    0x9c : b'\xc5\x93',     # œ
    0x9e : b'\xc5\xbe',     # ž
    0x9f : b'\xc5\xb8',     # Ÿ
    0xa0 : b'\xc2\xa0',     # no-break space
    0xa1 : b'\xc2\xa1', # ¡
    0xa2 : b'\xc2\xa2', # ¢
    0xa3 : b'\xc2\xa3', # £
    0xa4 : b'\xc2\xa4', # ¤
    0xa5 : b'\xc2\xa5', # ¥
    0xa6 : b'\xc2\xa6', # ¦
    0xa7 : b'\xc2\xa7', # §
    0xa8 : b'\xc2\xa8', # ¨
    0xa9 : b'\xc2\xa9', # ©
    0xaa : b'\xc2\xaa', # ª
    0xab : b'\xc2\xab', # «
    0xac : b'\xc2\xac', # ¬
    0xad : b'\xc2\xad', # soft hyphen
    0xae : b'\xc2\xae', # ®
    0xaf : b'\xc2\xaf', # ¯
    0xb0 : b'\xc2\xb0', # °
    0xb1 : b'\xc2\xb1', # ±
    0xb2 : b'\xc2\xb2', # ²
    0xb3 : b'\xc2\xb3', # ³
    0xb4 : b'\xc2\xb4', # ´
    0xb5 : b'\xc2\xb5', # µ
    0xb6 : b'\xc2\xb6', # ¶
    0xb7 : b'\xc2\xb7', # ·
    0xb8 : b'\xc2\xb8', # ¸
    0xb9 : b'\xc2\xb9', # ¹
    0xba : b'\xc2\xba', # º
    0xbb : b'\xc2\xbb', # »
    0xbc : b'\xc2\xbc', # ¼
    0xbd : b'\xc2\xbd', # ½
    0xbe : b'\xc2\xbe', # ¾
    0xbf : b'\xc2\xbf', # ¿
    0xc0 : b'\xc3\x80', # À
    0xc1 : b'\xc3\x81', # Á
    0xc2 : b'\xc3\x82', # Â
    0xc3 : b'\xc3\x83', # Ã
    0xc4 : b'\xc3\x84', # Ä
    0xc5 : b'\xc3\x85', # Å
    0xc6 : b'\xc3\x86', # Æ
    0xc7 : b'\xc3\x87', # Ç
    0xc8 : b'\xc3\x88', # È
    0xc9 : b'\xc3\x89', # É
    0xca : b'\xc3\x8a', # Ê
    0xcb : b'\xc3\x8b', # Ë
    0xcc : b'\xc3\x8c', # Ì
    0xcd : b'\xc3\x8d', # Í
    0xce : b'\xc3\x8e', # Î
    0xcf : b'\xc3\x8f', # Ï
    0xd0 : b'\xc3\x90', # Ð
    0xd1 : b'\xc3\x91', # Ñ
    0xd2 : b'\xc3\x92', # Ò
    0xd3 : b'\xc3\x93', # Ó
    0xd4 : b'\xc3\x94', # Ô
    0xd5 : b'\xc3\x95', # Õ
    0xd6 : b'\xc3\x96', # Ö
    0xd7 : b'\xc3\x97', # ×
    0xd8 : b'\xc3\x98', # Ø
    0xd9 : b'\xc3\x99', # Ù
    0xda : b'\xc3\x9a', # Ú
    0xdb : b'\xc3\x9b', # Û
    0xdc : b'\xc3\x9c', # Ü
    0xdd : b'\xc3\x9d', # Ý
    0xde : b'\xc3\x9e', # Þ
    0xdf : b'\xc3\x9f', # ß
    0xe0 : b'\xc3\xa0', # à
    0xe1 : b'\xc3\xa1', # á (fixed: was the lone byte b'\xa1')
    0xe2 : b'\xc3\xa2', # â
    0xe3 : b'\xc3\xa3', # ã
    0xe4 : b'\xc3\xa4', # ä
    0xe5 : b'\xc3\xa5', # å
    0xe6 : b'\xc3\xa6', # æ
    0xe7 : b'\xc3\xa7', # ç
    0xe8 : b'\xc3\xa8', # è
    0xe9 : b'\xc3\xa9', # é
    0xea : b'\xc3\xaa', # ê
    0xeb : b'\xc3\xab', # ë
    0xec : b'\xc3\xac', # ì
    0xed : b'\xc3\xad', # í
    0xee : b'\xc3\xae', # î
    0xef : b'\xc3\xaf', # ï
    0xf0 : b'\xc3\xb0', # ð
    0xf1 : b'\xc3\xb1', # ñ
    0xf2 : b'\xc3\xb2', # ò
    0xf3 : b'\xc3\xb3', # ó
    0xf4 : b'\xc3\xb4', # ô
    0xf5 : b'\xc3\xb5', # õ
    0xf6 : b'\xc3\xb6', # ö
    0xf7 : b'\xc3\xb7', # ÷
    0xf8 : b'\xc3\xb8', # ø
    0xf9 : b'\xc3\xb9', # ù
    0xfa : b'\xc3\xba', # ú
    0xfb : b'\xc3\xbb', # û
    0xfc : b'\xc3\xbc', # ü
    0xfd : b'\xc3\xbd', # ý
    0xfe : b'\xc3\xbe', # þ
    0xff : b'\xc3\xbf', # ÿ (added: previously missing)
    }
|
||||
|
||||
# (first lead byte, last lead byte, total sequence length) for UTF-8
# multi-byte sequences; consulted by detwingle() to skip over valid
# multi-byte characters.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
    ]

# Cached endpoints of the table above, so detwingle()'s inner loop can
# test "could this byte start a multi-byte sequence?" with two
# comparisons instead of scanning the table.
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    The input must be a bytestring. If you've already converted
    the document to Unicode, you're too late.

    The output is a bytestring in which `embedded_encoding`
    characters have been converted to their `main_encoding`
    equivalents.

    :param in_bytes: the mixed-encoding document, as bytes.
    :param main_encoding: must name UTF-8.
    :param embedded_encoding: must name Windows-1252.
    :raises NotImplementedError: for any other encoding pair.
    """
    if embedded_encoding.replace('_', '-').lower() not in (
        'windows-1252', 'windows_1252'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    # Chunks of the output: runs of untouched input interleaved with
    # UTF-8 replacements for rogue Windows-1252 bytes.
    byte_chunks = []

    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if not isinstance(byte, int):
            # Python 2.x: indexing bytes yields a 1-char str.
            byte = ord(byte)
        if (byte >= cls.FIRST_MULTIBYTE_MARKER
            and byte <= cls.LAST_MULTIBYTE_MARKER):
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            # NOTE: continuation bytes are not validated here; the
            # whole sequence is assumed to be well-formed UTF-8.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if byte >= start and byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another, one-byte chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1
    if chunk_start == 0:
        # The string is unchanged: return the original object rather
        # than a copy.
        return in_bytes
    else:
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
|
||||
|
||||
1355
bs4/element.py
Normal file
1355
bs4/element.py
Normal file
File diff suppressed because it is too large
Load Diff
537
bs4/testing.py
Normal file
537
bs4/testing.py
Normal file
@@ -0,0 +1,537 @@
|
||||
"""Helper classes for tests."""
|
||||
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
    """Base class for Beautiful Soup test cases: provides helpers for
    building soups with a known tree builder and comparing output."""

    @property
    def default_builder(self):
        # A fresh builder instance on every access, so individual
        # tests never share parser state.
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        # An explicit builder=... keyword overrides the default.
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        # Parse `to_parse` and assert it decodes to
        # `compare_parsed_to`; when that's omitted, assert a clean
        # round-trip (output equals input).
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    NOTE(review): several string literals in this class arrived with
    their HTML entity references decoded (a bare '"' where '&quot;'
    belongs, '&' for '&amp;', etc.), which made at least one method a
    syntax error and left several tests asserting nothing. The entity
    forms below are restored from each test's stated intent -- confirm
    against the canonical bs4 sources.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        # Bug fix: this previously re-asserted nested_b_tag, leaving
        # double_nested_b_tag assigned but never tested.
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        # Restored: the output side must contain the escaped form.
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        # Restored: decimal, hex, and named entity forms of n-tilde.
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        # Restored: decimal, hex, and named entity forms of n-tilde.
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        # Restored: the input must use &quot; (the decoded copy was a
        # syntax error and asserted nothing).
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        # Restored: three numeric references beyond the Unicode range.
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        # Restored: the double quotes inside the attribute value must
        # come out as &quot;.
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        # Restored: raw '&' in, '&amp;' out.
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        # Restored: two &nbsp; entities.
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8. (Comment fixed: 8859-9 is
        # Turkish; the code below uses iso8859-8.)
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
unicode(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
    def test_closing_namespaced_tag(self):
        """A namespaced child's closing tag must round-trip with its prefix intact."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        # Serialization must reproduce </dc:date> rather than a bare </date>.
        self.assertEqual(unicode(soup.p), markup)
|
||||
|
||||
    def test_namespaced_attributes(self):
        """Namespaced attributes (e.g. xsi:schemaLocation) must survive a round trip."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        # The prefixed attribute name must be preserved on output.
        self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):

    """Smoke test for a tree builder that supports HTML5.

    Inherits the generic HTML smoke tests and adds checks for the
    namespace handling the HTML5 spec mandates (HTML, SVG, MathML).
    """

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        """Ordinary HTML tags get the XHTML namespace per the HTML5 spec."""
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and its children get the SVG namespace."""
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        # The namespace applies to descendants too, not just the root <svg>.
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and its children get the MathML namespace."""
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
|
||||
def skipIf(condition, reason):
    """Minimal stand-in for unittest.skipIf.

    Returns a decorator. When *condition* is true, the decorated test is
    replaced by a no-op (so it is effectively skipped); otherwise the test
    is returned untouched. *reason* is accepted for API compatibility but
    not reported anywhere.
    """
    def _skipped(test, *args, **kwargs):
        # No-op replacement body: silently "pass" the test.
        return None

    def _decorate(test_item):
        # Swap in the no-op only when the skip condition holds.
        return _skipped if condition else test_item

    return _decorate
|
||||
@@ -36,7 +36,7 @@
|
||||
</div>
|
||||
% elif headphones.CURRENT_VERSION != headphones.LATEST_VERSION and headphones.INSTALL_TYPE != 'win':
|
||||
<div id="updatebar">
|
||||
A <a href="http://github.com/rembo10/headphones/compare/${headphones.CURRENT_VERSION}...${headphones.LATEST_VERSION}"> newer version</a> is available. You're ${headphones.COMMITS_BEHIND} commits behind. <a href="update">Update</a> or <a href="#" onclick="$('#updatebar').slideUp('slow');">Close</a>
|
||||
A <a href="https://github.com/AdeHub/headphones/compare/${headphones.CURRENT_VERSION}...${headphones.LATEST_VERSION}"> newer version</a> is available. You're ${headphones.COMMITS_BEHIND} commits behind. <a href="update">Update</a> or <a href="#" onclick="$('#updatebar').slideUp('slow');">Close</a>
|
||||
</div>
|
||||
% endif
|
||||
|
||||
|
||||
@@ -302,6 +302,19 @@ m<%inherit file="base.html"/>
|
||||
<input type="text" name="waffles_passkey" value="${config['waffles_passkey']}" size="36">
|
||||
</div>
|
||||
</div>
|
||||
<div class="row checkbox">
|
||||
<input id="userutracker" type="checkbox" name="rutracker" onclick="initConfigCheckbox($(this));" value="1" ${config['use_rutracker']} /><label>rutracker.org</label>
|
||||
</div>
|
||||
<div class="config">
|
||||
<div class="row">
|
||||
<label>rutracker User Name: </label>
|
||||
<input type="text" name="rutracker_user" value="${config['rutracker_user']}" size="36">
|
||||
</div>
|
||||
<div class="row">
|
||||
<label>rutracker Password: </label>
|
||||
<input type="password" name="rutracker_password" value="${config['rutracker_password']}" size="36">
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
</td>
|
||||
@@ -926,6 +939,7 @@ m<%inherit file="base.html"/>
|
||||
initConfigCheckbox("#usenewzbin");
|
||||
initConfigCheckbox("#usenzbsorg");
|
||||
initConfigCheckbox("#usewaffles");
|
||||
initConfigCheckbox("#userutracker");
|
||||
initConfigCheckbox("#useblackhole");
|
||||
initConfigCheckbox("#useapi");
|
||||
}
|
||||
|
||||
@@ -45,6 +45,8 @@
|
||||
fileid = 'nzb'
|
||||
if item['URL'].find('torrent') != -1:
|
||||
fileid = 'torrent'
|
||||
if item['URL'].find('rutracker') != -1:
|
||||
fileid = 'torrent'
|
||||
%>
|
||||
<tr class="grade${grade}">
|
||||
<td id="dateadded">${item['DateAdded']}</td>
|
||||
|
||||
@@ -154,6 +154,9 @@ MININOVA = None
|
||||
WAFFLES = None
|
||||
WAFFLES_UID = None
|
||||
WAFFLES_PASSKEY = None
|
||||
RUTRACKER = None
|
||||
RUTRACKER_USER = None
|
||||
RUTRACKER_PASSWORD = None
|
||||
DOWNLOAD_TORRENT_DIR = None
|
||||
|
||||
INTERFACE = None
|
||||
@@ -248,7 +251,7 @@ def initialize():
|
||||
LOSSLESS_DESTINATION_DIR, PREFERRED_QUALITY, PREFERRED_BITRATE, DETECT_BITRATE, ADD_ARTISTS, CORRECT_METADATA, MOVE_FILES, \
|
||||
RENAME_FILES, FOLDER_FORMAT, FILE_FORMAT, CLEANUP_FILES, INCLUDE_EXTRAS, EXTRAS, AUTOWANT_UPCOMING, AUTOWANT_ALL, \
|
||||
ADD_ALBUM_ART, EMBED_ALBUM_ART, EMBED_LYRICS, DOWNLOAD_DIR, BLACKHOLE, BLACKHOLE_DIR, USENET_RETENTION, SEARCH_INTERVAL, \
|
||||
TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, DOWNLOAD_TORRENT_DIR, \
|
||||
TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, RUTRACKER, RUTRACKER_USER, RUTRACKER_PASSWORD, DOWNLOAD_TORRENT_DIR, \
|
||||
LIBRARYSCAN_INTERVAL, DOWNLOAD_SCAN_INTERVAL, SAB_HOST, SAB_USERNAME, SAB_PASSWORD, SAB_APIKEY, SAB_CATEGORY, \
|
||||
NZBMATRIX, NZBMATRIX_USERNAME, NZBMATRIX_APIKEY, NEWZNAB, NEWZNAB_HOST, NEWZNAB_APIKEY, NEWZNAB_ENABLED, EXTRA_NEWZNABS,\
|
||||
NZBSORG, NZBSORG_UID, NZBSORG_HASH, NEWZBIN, NEWZBIN_UID, NEWZBIN_PASSWORD, LASTFM_USERNAME, INTERFACE, FOLDER_PERMISSIONS, \
|
||||
@@ -269,6 +272,7 @@ def initialize():
|
||||
CheckSection('NZBsorg')
|
||||
CheckSection('Newzbin')
|
||||
CheckSection('Waffles')
|
||||
CheckSection('Rutracker')
|
||||
CheckSection('Prowl')
|
||||
CheckSection('XBMC')
|
||||
CheckSection('NMA')
|
||||
@@ -342,6 +346,10 @@ def initialize():
|
||||
WAFFLES = bool(check_setting_int(CFG, 'Waffles', 'waffles', 0))
|
||||
WAFFLES_UID = check_setting_str(CFG, 'Waffles', 'waffles_uid', '')
|
||||
WAFFLES_PASSKEY = check_setting_str(CFG, 'Waffles', 'waffles_passkey', '')
|
||||
|
||||
RUTRACKER = bool(check_setting_int(CFG, 'Rutracker', 'rutracker', 0))
|
||||
RUTRACKER_USER = check_setting_str(CFG, 'Rutracker', 'rutracker_user', '')
|
||||
RUTRACKER_PASSWORD = check_setting_str(CFG, 'Rutracker', 'rutracker_password', '')
|
||||
|
||||
SAB_HOST = check_setting_str(CFG, 'SABnzbd', 'sab_host', '')
|
||||
SAB_USERNAME = check_setting_str(CFG, 'SABnzbd', 'sab_username', '')
|
||||
@@ -620,6 +628,11 @@ def config_write():
|
||||
new_config['Waffles']['waffles'] = int(WAFFLES)
|
||||
new_config['Waffles']['waffles_uid'] = WAFFLES_UID
|
||||
new_config['Waffles']['waffles_passkey'] = WAFFLES_PASSKEY
|
||||
|
||||
new_config['Rutracker'] = {}
|
||||
new_config['Rutracker']['rutracker'] = int(RUTRACKER)
|
||||
new_config['Rutracker']['rutracker_user'] = RUTRACKER_USER
|
||||
new_config['Rutracker']['rutracker_password'] = RUTRACKER_PASSWORD
|
||||
|
||||
new_config['General']['search_interval'] = SEARCH_INTERVAL
|
||||
new_config['General']['libraryscan_interval'] = LIBRARYSCAN_INTERVAL
|
||||
|
||||
@@ -28,6 +28,9 @@ from headphones import logger, db, helpers, classes, sab
|
||||
|
||||
import lib.bencode as bencode
|
||||
|
||||
import headphones.searcher_rutracker as rutrackersearch
|
||||
rutracker = rutrackersearch.Rutracker()
|
||||
|
||||
class NewzbinDownloader(urllib.FancyURLopener):
|
||||
|
||||
def __init__(self):
|
||||
@@ -97,7 +100,7 @@ def searchforalbum(albumid=None, new=False, lossless=False):
|
||||
else:
|
||||
foundNZB = searchNZB(result['AlbumID'], new)
|
||||
|
||||
if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none":
|
||||
if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none":
|
||||
if result['Status'] == "Wanted Lossless":
|
||||
searchTorrent(result['AlbumID'], new, losslessOnly=True)
|
||||
else:
|
||||
@@ -109,7 +112,7 @@ def searchforalbum(albumid=None, new=False, lossless=False):
|
||||
if (headphones.NZBMATRIX or headphones.NEWZNAB or headphones.NZBSORG or headphones.NEWZBIN) and (headphones.SAB_HOST or headphones.BLACKHOLE):
|
||||
foundNZB = searchNZB(albumid, new, lossless)
|
||||
|
||||
if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none":
|
||||
if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none":
|
||||
searchTorrent(albumid, new, lossless)
|
||||
|
||||
def searchNZB(albumid=None, new=False, losslessOnly=False):
|
||||
@@ -632,6 +635,13 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False):
|
||||
results = myDB.select('SELECT ArtistName, AlbumTitle, AlbumID, ReleaseDate from albums WHERE Status="Wanted" OR Status="Wanted Lossless"')
|
||||
new = True
|
||||
|
||||
# rutracker login
|
||||
|
||||
if headphones.RUTRACKER and results:
|
||||
rulogin = rutracker.login(headphones.RUTRACKER_USER, headphones.RUTRACKER_PASSWORD)
|
||||
if not rulogin:
|
||||
logger.info(u'Could not login to rutracker, search results will exclude this provider')
|
||||
|
||||
for albums in results:
|
||||
|
||||
albumid = albums[2]
|
||||
@@ -806,7 +816,54 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False):
|
||||
except Exception, e:
|
||||
logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s" % e)
|
||||
|
||||
|
||||
|
||||
# rutracker.org
|
||||
|
||||
if headphones.RUTRACKER and rulogin:
|
||||
|
||||
provider = "rutracker.org"
|
||||
|
||||
# Ignore if release date not specified, results too unpredictable
|
||||
|
||||
if not year:
|
||||
logger.info(u'Release date not specified, ignoring for rutracker.org')
|
||||
else:
|
||||
|
||||
bitrate = False
|
||||
|
||||
if headphones.PREFERRED_QUALITY == 3 or losslessOnly:
|
||||
format = 'lossless'
|
||||
maxsize = 10000000000
|
||||
elif headphones.PREFERRED_QUALITY == 1:
|
||||
format = 'lossless+mp3'
|
||||
maxsize = 10000000000
|
||||
else:
|
||||
format = 'mp3'
|
||||
maxsize = 300000000
|
||||
if headphones.PREFERRED_QUALITY == 2 and headphones.PREFERRED_BITRATE:
|
||||
bitrate = True
|
||||
|
||||
# build search url based on above
|
||||
|
||||
searchURL = rutracker.searchurl(artistterm, albumterm, year, format)
|
||||
logger.info(u'Parsing results from <a href="%s">rutracker.org</a>' % searchURL)
|
||||
|
||||
# parse results and get best match
|
||||
|
||||
rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid, bitrate)
|
||||
|
||||
# add best match to overall results list
|
||||
|
||||
if rulist:
|
||||
for ru in rulist:
|
||||
title = ru[0].decode('utf-8')
|
||||
size = ru[1]
|
||||
url = ru[2]
|
||||
resultlist.append((title, size, url, provider))
|
||||
logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
|
||||
else:
|
||||
logger.info(u"No valid results found from %s" % (provider))
|
||||
|
||||
|
||||
if headphones.ISOHUNT:
|
||||
provider = "isoHunt"
|
||||
@@ -1029,19 +1086,24 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False):
|
||||
|
||||
# Get torrent name from .torrent, this is usually used by the torrent client as the folder name
|
||||
|
||||
|
||||
torrent_name = torrent_folder_name + '.torrent'
|
||||
download_path = os.path.join(headphones.TORRENTBLACKHOLE_DIR, torrent_name)
|
||||
try:
|
||||
#Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name.
|
||||
torrent_file = open(download_path, 'wb')
|
||||
torrent_file.write(data)
|
||||
torrent_file.close()
|
||||
#Open the fresh torrent file again so we can extract the proper torrent name
|
||||
#Used later in post-processing.
|
||||
torrent_file = open(download_path, 'rb')
|
||||
if bestqual[3] == 'rutracker.org':
|
||||
download_path = rutracker.get_torrent(bestqual[2], headphones.TORRENTBLACKHOLE_DIR)
|
||||
if not download_path:
|
||||
break
|
||||
else:
|
||||
#Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name.
|
||||
torrent_file = open(download_path, 'wb')
|
||||
torrent_file.write(data)
|
||||
torrent_file.close()
|
||||
|
||||
#Open the fresh torrent file again so we can extract the proper torrent name
|
||||
#Used later in post-processing.
|
||||
torrent_file = open(download_path, 'rb')
|
||||
torrent_info = bencode.bdecode(torrent_file.read())
|
||||
torrent_file.close()
|
||||
torrent_file.close()
|
||||
torrent_folder_name = torrent_info['info'].get('name','').decode('utf-8')
|
||||
logger.info('Torrent folder name: %s' % torrent_folder_name)
|
||||
except Exception, e:
|
||||
@@ -1058,7 +1120,12 @@ def preprocesstorrent(resultlist):
|
||||
selresult = result
|
||||
elif int(selresult[1]) < int(result[1]): # if size is lower than new result replace previous selected result (bigger size = better quality?)
|
||||
selresult = result
|
||||
|
||||
|
||||
# get outta here if rutracker
|
||||
|
||||
if selresult[3] == 'rutracker.org':
|
||||
return True, selresult
|
||||
|
||||
try:
|
||||
request = urllib2.Request(selresult[2])
|
||||
request.add_header('Accept-encoding', 'gzip')
|
||||
|
||||
287
headphones/searcher_rutracker.py
Normal file
287
headphones/searcher_rutracker.py
Normal file
@@ -0,0 +1,287 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
|
||||
# Headphones rutracker.org search
|
||||
# Functions called from searcher.py
|
||||
|
||||
import urllib
|
||||
import urllib2
|
||||
import cookielib
|
||||
from urlparse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from headphones import logger, db
|
||||
import lib.bencode as bencode
|
||||
import os
|
||||
|
||||
class Rutracker():
    """Search provider for rutracker.org, called from searcher.py.

    Handles forum login via session cookies, building tracker search URLs,
    scraping result pages with BeautifulSoup, validating candidate torrents
    against the album's expected track count, and downloading .torrent files.
    """

    # Set by login(); callers check this before searching.
    logged_in = False
    # Stores a number of login attempts to prevent recursion.
    #login_counter = 0

    def __init__(self):
        # Cookie-aware opener: the tracker requires session cookies both for
        # searching and for .torrent downloads. NOTE(review): this also
        # installs the opener globally via install_opener, which affects
        # every urllib2 caller in the process — confirm that is intended.
        self.cookiejar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar))
        urllib2.install_opener(self.opener)

    def login(self, login, password):
        """Implements tracker login procedure.

        Returns True only when the post-login session cookie ('bb_data')
        shows up in the cookie jar; False otherwise.
        """

        self.logged_in = False

        if login is None or password is None:
            return False

        #self.login_counter += 1

        # No recursion wanted.
        #if self.login_counter > 1:
        #    return False

        # "Вход" ("Enter") is the literal label of the login submit button
        # the form handler expects — do not translate this value.
        params = urllib.urlencode({"login_username" : login,
                                   "login_password" : password,
                                   "login" : "Вход"})

        try:
            self.opener.open("http://login.rutracker.org/forum/login.php", params)
        except :
            # Network/HTTP errors are swallowed; the cookie check below is
            # what decides whether login actually succeeded.
            pass

        # Check if we're logged in
        for cookie in self.cookiejar:
            if cookie.name == 'bb_data':
                self.logged_in = True

        return self.logged_in

    def searchurl(self, artist, album, year, format):
        """
        Return the search url

        Builds a tracker.php query from artist/album/year plus a format
        filter token, sorted by size descending.
        """

        # Build search url
        # For Various Artists releases, search on album and year only.
        searchterm = ''
        if artist != 'Various Artists':
            searchterm = artist
            searchterm = searchterm + ' '
        searchterm = searchterm + album
        searchterm = searchterm + ' '
        searchterm = searchterm + year

        providerurl = "http://rutracker.org/forum/tracker.php"

        # Map the caller's quality choice onto the tracker's search syntax
        # ('||' is the tracker's OR operator between format keywords).
        if format == 'lossless':
            format = '+lossless'
        elif format == 'lossless+mp3':
            format = '+lossless||mp3||aac'
        else:
            format = '+mp3||aac'

        # sort by size, descending.
        sort = '&o=7&s=2'

        searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort)

        return searchurl

    def search(self, searchurl, maxsize, minseeders, albumid, bitrate):
        """
        Parse the search results and return the first valid torrent

        Scrapes the result page, filters out unwanted releases, and keeps a
        candidate only if its audio track count matches headphones' track
        count for the album (with some tolerance for deluxe editions and
        cue-split single-file rips). Returns a list of
        (title, size, topic-url) tuples, or False when nothing usable is
        found. When *bitrate* is true all valid candidates are returned
        instead of just the first.
        """

        titles = []
        urls = []
        seeders = []
        sizes = []
        torrentlist = []
        rulist = []

        try:

            page = self.opener.open(searchurl, timeout=60)
            soup = BeautifulSoup(page.read())

            # Debug
            #logger.debug (soup.prettify())

            # Title — these CSS class names match the tracker's result-table
            # markup and will break silently if the site layout changes.
            for link in soup.find_all('a', attrs={'class' : 'med tLink bold'}):
                title = link.get_text()
                titles.append(title)

            # Download URL
            for link in soup.find_all('a', attrs={'class' : 'small tr-dl dl-stub'}):
                url = link.get('href')
                urls.append(url)

            # Seeders
            for link in soup.find_all('td', attrs={'class' : 'row4 seedmed'}):
                seeder = link.get_text()
                seeders.append(seeder)

            # Size
            for link in soup.find_all('td', attrs={'class' : 'row4 small nowrap tor-size'}):
                size = link.u.string
                sizes.append(size)

        except :
            # Scrape/network failures leave the lists empty; handled below.
            pass

        # Combine lists
        torrentlist = zip(titles, urls, seeders, sizes)

        # return if nothing found
        if not torrentlist:
            return False

        # get headphones track count for album, return if not found
        hptrackcount = 0

        myDB = db.DBConnection()
        tracks = myDB.select('SELECT TrackTitle from tracks WHERE AlbumID=?', [albumid])
        for track in tracks:
            hptrackcount += 1

        if not hptrackcount:
            logger.info('headphones track info not found, cannot compare to torrent')
            return False

        # Return the first valid torrent, unless we want a preferred bitrate then we want all valid entries
        for torrent in torrentlist:

            returntitle = torrent[0].encode('utf-8')
            url = torrent[1]
            seeders = torrent[2]
            size = torrent[3]

            # Attempt to filter out unwanted
            title = returntitle.lower()

            if 'promo' not in title and 'vinyl' not in title and 'songbook' not in title and 'tvrip' not in title and 'hdtv' not in title and 'dvd' not in title \
            and int(size) <= maxsize and int(seeders) >= minseeders:

                # Check torrent info
                # The topic id is the 't' parameter of the URL query string.
                torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
                # The tracker requires a 'bb_dl' cookie naming the topic
                # before it will serve the .torrent file.
                self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))

                # Debug
                #for cookie in self.cookiejar:
                #    logger.debug ('Cookie: %s' % cookie)

                try:
                    page = self.opener.open(url)
                    torrent = page.read()
                    if torrent:
                        decoded = bencode.bdecode(torrent)
                        metainfo = decoded['info']
                    page.close ()
                except Exception, e:
                    # NOTE(review): one bad download aborts the whole search
                    # (returns False) rather than skipping this candidate.
                    logger.error('Error getting torrent: %s' % e)
                    return False

                # get torrent track count and check for cue
                trackcount = 0
                cuecount = 0

                if 'files' in metainfo: # multi
                    for pathfile in metainfo['files']:
                        path = pathfile['path']
                        for file in path:
                            if '.ape' in file or '.flac' in file or '.ogg' in file or '.m4a' in file or '.aac' in file or '.mp3' in file or '.wav' in file or '.aif' in file:
                                trackcount += 1
                            if '.cue' in file:
                                cuecount += 1

                #Torrent topic page
                topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id
                logger.debug ('torrent title: %s' % title)
                logger.debug ('headphones trackcount: %s' % hptrackcount)
                logger.debug ('rutracker trackcount: %s' % trackcount)

                # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s)
                # This is for the case where we have a single .flac/.wav which can be split by cue
                # Not great, but shouldn't be doing this too often
                totallogcount = 0
                if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount:
                    page = self.opener.open(topicurl, timeout=60)
                    soup = BeautifulSoup(page.read())
                    # The TOC heading appears in English or Russian depending
                    # on the topic; both literals are page text, not comments.
                    findtoc = soup.find_all(text='TOC of the extracted CD')
                    if not findtoc:
                        findtoc = soup.find_all(text='TOC извлечённого CD')
                    for toc in findtoc:
                        logcount = 0
                        for toccontent in toc.find_all_next(text=True):
                            # Count consecutive numbered rows of the TOC
                            # table; a non-digit row terminates the run.
                            cut_string = toccontent.split('|')
                            new_string = cut_string[0].lstrip().rstrip()
                            if new_string == '1' or new_string == '01':
                                logcount = 1
                            elif logcount > 0:
                                if new_string.isdigit():
                                    logcount += 1
                                else:
                                    break
                        totallogcount = totallogcount + logcount

                    if totallogcount > 0:
                        trackcount = totallogcount
                        logger.debug ('rutracker logtrackcount: %s' % totallogcount)

                # If torrent track count = hp track count then return torrent,
                # if greater, check for deluxe/special/foreign editions
                # if less, then allow if it's a single track with a cue
                valid = False

                if trackcount == hptrackcount:
                    valid = True
                elif trackcount > hptrackcount:
                    if 'deluxe' in title or 'edition' in title or 'japanese' in title:
                        valid = True

                # return 1st valid torrent if not checking by bitrate, else add to list and return at end
                if valid:
                    rulist.append((returntitle, size, topicurl))
                    if not bitrate:
                        return rulist

        return rulist


    def get_torrent(self, url, savelocation):
        """Download the .torrent behind topic *url* into *savelocation*.

        Returns the path of the saved file, or False on failure.
        """

        # Topic id is the 't' parameter of the topic URL's query string.
        torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
        # Same 'bb_dl' cookie dance as in search(): required before the
        # tracker will serve the download.
        self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))
        downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
        torrent_name = torrent_id + '.torrent'
        download_path = os.path.join(savelocation, torrent_name)

        try:
            page = self.opener.open(downloadurl)
            torrent = page.read()
            fp = open (download_path, 'wb')
            fp.write (torrent)
            fp.close ()
        except Exception, e:
            logger.error('Error getting torrent: %s' % e)
            return False

        return download_path
|
||||
|
||||
@@ -20,7 +20,7 @@ from headphones import logger, version
|
||||
|
||||
import lib.simplejson as simplejson
|
||||
|
||||
user = "rembo10"
|
||||
user = "AdeHub"
|
||||
branch = "master"
|
||||
|
||||
def runGit(args):
|
||||
|
||||
@@ -463,6 +463,9 @@ class WebInterface(object):
|
||||
"use_waffles" : checked(headphones.WAFFLES),
|
||||
"waffles_uid" : headphones.WAFFLES_UID,
|
||||
"waffles_passkey": headphones.WAFFLES_PASSKEY,
|
||||
"use_rutracker" : checked(headphones.RUTRACKER),
|
||||
"rutracker_user" : headphones.RUTRACKER_USER,
|
||||
"rutracker_password": headphones.RUTRACKER_PASSWORD,
|
||||
"pref_qual_0" : radio(headphones.PREFERRED_QUALITY, 0),
|
||||
"pref_qual_1" : radio(headphones.PREFERRED_QUALITY, 1),
|
||||
"pref_qual_3" : radio(headphones.PREFERRED_QUALITY, 3),
|
||||
@@ -545,7 +548,7 @@ class WebInterface(object):
|
||||
sab_category=None, download_dir=None, blackhole=0, blackhole_dir=None, usenet_retention=None, nzbmatrix=0, nzbmatrix_username=None, nzbmatrix_apikey=None,
|
||||
newznab=0, newznab_host=None, newznab_apikey=None, newznab_enabled=0, nzbsorg=0, nzbsorg_uid=None, nzbsorg_hash=None, newzbin=0, newzbin_uid=None,
|
||||
newzbin_password=None, preferred_quality=0, preferred_bitrate=None, detect_bitrate=0, move_files=0, torrentblackhole_dir=None, download_torrent_dir=None,
|
||||
numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rename_files=0, correct_metadata=0,
|
||||
numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rutracker=0, rutracker_user=None, rutracker_password=None, rename_files=0, correct_metadata=0,
|
||||
cleanup_files=0, add_album_art=0, embed_album_art=0, embed_lyrics=0, destination_dir=None, lossless_destination_dir=None, folder_format=None, file_format=None,
|
||||
include_extras=0, single=0, ep=0, compilation=0, soundtrack=0, live=0, remix=0, spokenword=0, audiobook=0, autowant_upcoming=False, autowant_all=False,
|
||||
interface=None, log_dir=None, music_encoder=0, encoder=None, bitrate=None, samplingfrequency=None, encoderfolder=None, advancedencoder=None,
|
||||
@@ -595,6 +598,9 @@ class WebInterface(object):
|
||||
headphones.WAFFLES = waffles
|
||||
headphones.WAFFLES_UID = waffles_uid
|
||||
headphones.WAFFLES_PASSKEY = waffles_passkey
|
||||
headphones.RUTRACKER = rutracker
|
||||
headphones.RUTRACKER_USER = rutracker_user
|
||||
headphones.RUTRACKER_PASSWORD = rutracker_password
|
||||
headphones.PREFERRED_QUALITY = int(preferred_quality)
|
||||
headphones.PREFERRED_BITRATE = preferred_bitrate
|
||||
headphones.PREFERRED_BITRATE_HIGH_BUFFER = preferred_bitrate_high_buffer
|
||||
|
||||
17
html5lib/__init__.py
Normal file
17
html5lib/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
HTML parsing library based on the WHATWG "HTML5"
|
||||
specification. The parser is designed to be compatible with existing
|
||||
HTML found in the wild and implements well-defined error recovery that
|
||||
is largely compatible with modern desktop web browsers.
|
||||
|
||||
Example usage:
|
||||
|
||||
import html5lib
|
||||
f = open("my_document.html")
|
||||
tree = html5lib.parse(f)
|
||||
"""
|
||||
__version__ = "0.95-dev"
|
||||
from html5parser import HTMLParser, parse, parseFragment
|
||||
from treebuilders import getTreeBuilder
|
||||
from treewalkers import getTreeWalker
|
||||
from serializer import serialize
|
||||
3085
html5lib/constants.py
Normal file
3085
html5lib/constants.py
Normal file
File diff suppressed because it is too large
Load Diff
0
html5lib/filters/__init__.py
Normal file
0
html5lib/filters/__init__.py
Normal file
10
html5lib/filters/_base.py
Normal file
10
html5lib/filters/_base.py
Normal file
@@ -0,0 +1,10 @@
|
||||
|
||||
class Filter(object):
    """Base class for token-stream filters.

    Wraps a source iterable of tokens and transparently proxies both
    iteration and attribute access to it, so subclasses only override
    what they need (typically __iter__).
    """

    def __init__(self, source):
        # Hold on to the wrapped token stream.
        self.source = source

    def __iter__(self):
        # Iterate the wrapped stream directly; subclasses override this
        # to transform tokens on the way through.
        return iter(self.source)

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails: fall back to
        # the wrapped stream so its API shows through the filter.
        return getattr(self.source, name)
|
||||
127
html5lib/filters/formfiller.py
Normal file
127
html5lib/filters/formfiller.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#
|
||||
# The goal is to finally have a form filler where you pass data for
|
||||
# each form, using the algorithm for "Seeding a form with initial values"
|
||||
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
|
||||
#
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class SimpleFilter(_base.Filter):
    """Token-stream filter that seeds form controls with submitted values.

    Rewrites <input>, <textarea>, <select>/<option> tokens so their
    values/checked/selected state reflect *fieldStorage* (a cgi.FieldStorage
    -like object supporting getlist), following the WHATWG "seeding a form
    with initial values" algorithm referenced in the module header.
    """

    def __init__(self, source, fieldStorage):
        _base.Filter.__init__(self, source)
        # Source of submitted values; getlist(name) returns all values
        # submitted under that field name.
        self.fieldStorage = fieldStorage

    def __iter__(self):
        # Tracks, per field name, how many submitted values have already
        # been consumed — supports repeated fields of the same name.
        field_indices = {}
        state = None
        # Name/type of the form control currently being processed; carried
        # across tokens for container elements (textarea/select).
        field_name = None
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    # Remember where value=/checked= sit in the attribute
                    # list so they can be rewritten or deleted in place.
                    input_value_index = -1
                    input_checked_index = -1
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i

                    # Next unconsumed submitted value for this field name,
                    # or "" when the submission has no (more) values.
                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""

                    if field_type in (u"checkbox", u"radio"):
                        # Check/uncheck according to whether this control's
                        # value matches the submitted value.
                        if value_list:
                            if token["data"][input_value_index][1] == value:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                del token["data"][input_checked_index]

                    elif field_type not in (u"button", u"submit", u"reset"):
                        # Text-like inputs: overwrite (or add) the value=.
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1

                    # <input> is self-contained; reset the carried state.
                    field_type = None
                    field_name = None

                elif name == "textarea":
                    # Value is injected at the matching EndTag (below).
                    field_type = "textarea"
                    field_name = dict((token["data"])[::-1])["name"]

                elif name == "select":
                    # [::-1] so the FIRST occurrence of a duplicated
                    # attribute wins when building the dict.
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False

                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        # Options that take their value from their text
                        # content are not handled yet.
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # Select the option whose value matches; single
                            # selects only allow the first match.
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]

            elif field_type is not None and field_name and type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        # Emit the submitted value as the textarea's text
                        # content, just before its closing tag.
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1

                    field_name = None

                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing

            elif field_type == "textarea":
                # Drop the textarea's original content; the submitted value
                # replaces it at the EndTag above.
                continue # ignore token

            yield token
|
||||
62
html5lib/filters/inject_meta_charset.py
Normal file
62
html5lib/filters/inject_meta_charset.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
    """Token-stream filter that injects/repairs a <meta charset> declaration.

    Ensures the serialized document advertises *encoding*: rewrites an
    existing <meta charset=...> or http-equiv Content-Type, or inserts a
    fresh <meta charset> into <head> when none is present. With
    encoding=None the stream passes through unchanged.
    """

    def __init__(self, source, encoding):
        _base.Filter.__init__(self, source)
        # Target encoding name; None disables injection entirely.
        self.encoding = encoding

    def __iter__(self):
        # pre_head -> in_head -> post_head; tokens seen while in_head are
        # buffered in `pending` so a <meta> can be inserted before them.
        state = "pre_head"
        # Treat encoding=None as "already satisfied" so nothing is injected.
        meta_found = (self.encoding is None)
        pending = []

        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == u"head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == u"meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    for (namespace,name),value in token["data"].iteritems():
                        if namespace != None:
                            # Only non-namespaced attributes are relevant.
                            continue
                        elif name.lower() == u'charset':
                            token["data"][(namespace,name)] = self.encoding
                            meta_found = True
                            break
                        elif name == u'http-equiv' and value.lower() == u'content-type':
                            has_http_equiv_content_type = True
                    else:
                        # for/else: no charset= attribute was found; fall
                        # back to rewriting an http-equiv Content-Type.
                        if has_http_equiv_content_type and (None, u"content") in token["data"]:
                            token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
                            meta_found = True

                elif token["name"].lower() == u"head" and not meta_found:
                    # insert meta into empty head
                    # (an EmptyTag <head/> is expanded into start/meta/end)
                    yield {"type": "StartTag", "name": u"head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": u"meta",
                           "data": {(None, u"charset"): self.encoding}}
                    yield {"type": "EndTag", "name": u"head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == u"head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": u"meta",
                               "data": {(None, u"charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                # Buffer while inside <head> so injection can happen before
                # the buffered tokens are emitted at </head>.
                pending.append(token)
            else:
                yield token
|
||||
88
html5lib/filters/lint.py
Normal file
88
html5lib/filters/lint.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
import _base
|
||||
from html5lib.constants import cdataElements, rcdataElements, voidElements
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class LintError(Exception): pass
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __iter__(self):
|
||||
open_elements = []
|
||||
contentModelFlag = "PCDATA"
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if type == "StartTag" and name in voidElements:
|
||||
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
|
||||
elif type == "EmptyTag" and name not in voidElements:
|
||||
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
|
||||
if type == "StartTag":
|
||||
open_elements.append(name)
|
||||
for name, value in token["data"]:
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty attribute name"))
|
||||
if not isinstance(value, unicode):
|
||||
raise LintError(_("Attribute value is not a string: %r") % value)
|
||||
if name in cdataElements:
|
||||
contentModelFlag = "CDATA"
|
||||
elif name in rcdataElements:
|
||||
contentModelFlag = "RCDATA"
|
||||
elif name == "plaintext":
|
||||
contentModelFlag = "PLAINTEXT"
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if name in voidElements:
|
||||
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
|
||||
start_name = open_elements.pop()
|
||||
if start_name != name:
|
||||
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
|
||||
contentModelFlag = "PCDATA"
|
||||
|
||||
elif type == "Comment":
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Comment not in PCDATA content model flag"))
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
if not isinstance(data, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % data)
|
||||
if not data:
|
||||
raise LintError(_(u"%s token with empty data") % type)
|
||||
if type == "SpaceCharacters":
|
||||
data = data.strip(spaceCharacters)
|
||||
if data:
|
||||
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
# XXX: what to do with token["data"] ?
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
pass
|
||||
|
||||
else:
|
||||
raise LintError(_(u"Unknown token type: %s") % type)
|
||||
|
||||
yield token
|
||||
202
html5lib/filters/optionaltags.py
Normal file
202
html5lib/filters/optionaltags.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def slider(self):
|
||||
previous1 = previous2 = None
|
||||
for token in self.source:
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, token
|
||||
previous2 = previous1
|
||||
previous1 = token
|
||||
yield previous2, previous1, None
|
||||
|
||||
def __iter__(self):
|
||||
for previous, token, next in self.slider():
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if (token["data"] or
|
||||
not self.is_optional_start(token["name"], previous, next)):
|
||||
yield token
|
||||
elif type == "EndTag":
|
||||
if not self.is_optional_end(token["name"], next):
|
||||
yield token
|
||||
else:
|
||||
yield token
|
||||
|
||||
def is_optional_start(self, tagname, previous, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in 'html':
|
||||
# An html element's start tag may be omitted if the first thing
|
||||
# inside the html element is not a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname == 'head':
|
||||
# A head element's start tag may be omitted if the first thing
|
||||
# inside the head element is an element.
|
||||
# XXX: we also omit the start tag if the head element is empty
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return True
|
||||
elif type == "EndTag":
|
||||
return next["name"] == "head"
|
||||
elif tagname == 'body':
|
||||
# A body element's start tag may be omitted if the first thing
|
||||
# inside the body element is not a space character or a comment,
|
||||
# except if the first thing inside the body element is a script
|
||||
# or style element and the node immediately preceding the body
|
||||
# element is a head element whose end tag has been omitted.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we do not look at the preceding event, so we never omit
|
||||
# the body element's start tag if it's followed by a script or
|
||||
# a style element.
|
||||
return next["name"] not in ('script', 'style')
|
||||
else:
|
||||
return True
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's start tag may be omitted if the first thing
|
||||
# inside the colgroup element is a col element, and if the element
|
||||
# is not immediately preceeded by another colgroup element whose
|
||||
# end tag has been omitted.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# XXX: we do not look at the preceding event, so instead we never
|
||||
# omit the colgroup element's end tag when it is immediately
|
||||
# followed by another colgroup element. See is_optional_end.
|
||||
return next["name"] == "col"
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tbody':
|
||||
# A tbody element's start tag may be omitted if the first thing
|
||||
# inside the tbody element is a tr element, and if the element is
|
||||
# not immediately preceeded by a tbody, thead, or tfoot element
|
||||
# whose end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
# omit the thead and tfoot elements' end tag when they are
|
||||
# immediately followed by a tbody element. See is_optional_end.
|
||||
if previous and previous['type'] == 'EndTag' and \
|
||||
previous['name'] in ('tbody','thead','tfoot'):
|
||||
return False
|
||||
return next["name"] == 'tr'
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
def is_optional_end(self, tagname, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in ('html', 'head', 'body'):
|
||||
# An html element's end tag may be omitted if the html element
|
||||
# is not immediately followed by a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname in ('li', 'optgroup', 'tr'):
|
||||
# A li element's end tag may be omitted if the li element is
|
||||
# immediately followed by another li element or if there is
|
||||
# no more content in the parent element.
|
||||
# An optgroup element's end tag may be omitted if the optgroup
|
||||
# element is immediately followed by another optgroup element,
|
||||
# or if there is no more content in the parent element.
|
||||
# A tr element's end tag may be omitted if the tr element is
|
||||
# immediately followed by another tr element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] == tagname
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('dt', 'dd'):
|
||||
# A dt element's end tag may be omitted if the dt element is
|
||||
# immediately followed by another dt element or a dd element.
|
||||
# A dd element's end tag may be omitted if the dd element is
|
||||
# immediately followed by another dd element or a dt element,
|
||||
# or if there is no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('dt', 'dd')
|
||||
elif tagname == 'dd':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'p':
|
||||
# A p element's end tag may be omitted if the p element is
|
||||
# immediately followed by an address, article, aside,
|
||||
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
||||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
||||
# nav, ol, p, pre, section, table, or ul, element, or if
|
||||
# there is no more content in the parent element.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return next["name"] in ('address', 'article', 'aside',
|
||||
'blockquote', 'datagrid', 'dialog',
|
||||
'dir', 'div', 'dl', 'fieldset', 'footer',
|
||||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'header', 'hr', 'menu', 'nav', 'ol',
|
||||
'p', 'pre', 'section', 'table', 'ul')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'option':
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if it is immediately followed by an <code>optgroup</code>
|
||||
# element, or if there is no more content in the parent
|
||||
# element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('option', 'optgroup')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('rt', 'rp'):
|
||||
# An rt element's end tag may be omitted if the rt element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
# An rp element's end tag may be omitted if the rp element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('rt', 'rp')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's end tag may be omitted if the colgroup
|
||||
# element is not immediately followed by a space character or
|
||||
# a comment.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we also look for an immediately following colgroup
|
||||
# element. See is_optional_start.
|
||||
return next["name"] != 'colgroup'
|
||||
else:
|
||||
return True
|
||||
elif tagname in ('thead', 'tbody'):
|
||||
# A thead element's end tag may be omitted if the thead element
|
||||
# is immediately followed by a tbody or tfoot element.
|
||||
# A tbody element's end tag may be omitted if the tbody element
|
||||
# is immediately followed by a tbody or tfoot element, or if
|
||||
# there is no more content in the parent element.
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ['tbody', 'tfoot']
|
||||
elif tagname == 'tbody':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tfoot':
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] == 'tbody'
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('td', 'th'):
|
||||
# A td element's end tag may be omitted if the td element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
# A th element's end tag may be omitted if the th element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('td', 'th')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
return False
|
||||
8
html5lib/filters/sanitizer.py
Normal file
8
html5lib/filters/sanitizer.py
Normal file
@@ -0,0 +1,8 @@
|
||||
import _base
|
||||
from html5lib.sanitizer import HTMLSanitizerMixin
|
||||
|
||||
class Filter(_base.Filter, HTMLSanitizerMixin):
|
||||
def __iter__(self):
|
||||
for token in _base.Filter.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token: yield token
|
||||
41
html5lib/filters/whitespace.py
Normal file
41
html5lib/filters/whitespace.py
Normal file
@@ -0,0 +1,41 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import rcdataElements, spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
|
||||
|
||||
class Filter(_base.Filter):
|
||||
|
||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||
|
||||
def __iter__(self):
|
||||
preserve = 0
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag" \
|
||||
and (preserve or token["name"] in self.spacePreserveElements):
|
||||
preserve += 1
|
||||
|
||||
elif type == "EndTag" and preserve:
|
||||
preserve -= 1
|
||||
|
||||
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
||||
# Test on token["data"] above to not introduce spaces where there were not
|
||||
token["data"] = u" "
|
||||
|
||||
elif not preserve and type == "Characters":
|
||||
token["data"] = collapse_spaces(token["data"])
|
||||
|
||||
yield token
|
||||
|
||||
def collapse_spaces(text):
|
||||
return SPACES_REGEX.sub(' ', text)
|
||||
|
||||
2733
html5lib/html5parser.py
Normal file
2733
html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
Load Diff
177
html5lib/ihatexml.py
Normal file
177
html5lib/ihatexml.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import re
|
||||
|
||||
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||
|
||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||
|
||||
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
|
||||
|
||||
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||
|
||||
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||
|
||||
letter = " | ".join([baseChar, ideographic])
|
||||
|
||||
#Without the
|
||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||
extender])
|
||||
nameFirst = " | ".join([letter, "_"])
|
||||
|
||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
||||
|
||||
def charStringToList(chars):
|
||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
||||
rv = []
|
||||
for item in charRanges:
|
||||
foundMatch = False
|
||||
for regexp in (reChar, reCharRange):
|
||||
match = regexp.match(item)
|
||||
if match is not None:
|
||||
rv.append([hexToInt(item) for item in match.groups()])
|
||||
if len(rv[-1]) == 1:
|
||||
rv[-1] = rv[-1]*2
|
||||
foundMatch = True
|
||||
break
|
||||
if not foundMatch:
|
||||
assert len(item) == 1
|
||||
|
||||
rv.append([ord(item)] * 2)
|
||||
rv = normaliseCharList(rv)
|
||||
return rv
|
||||
|
||||
def normaliseCharList(charList):
|
||||
charList = sorted(charList)
|
||||
for item in charList:
|
||||
assert item[1] >= item[0]
|
||||
rv = []
|
||||
i = 0
|
||||
while i < len(charList):
|
||||
j = 1
|
||||
rv.append(charList[i])
|
||||
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
|
||||
rv[-1][1] = charList[i+j][1]
|
||||
j += 1
|
||||
i += j
|
||||
return rv
|
||||
|
||||
#We don't really support characters above the BMP :(
|
||||
max_unicode = int("FFFF", 16)
|
||||
|
||||
def missingRanges(charList):
|
||||
rv = []
|
||||
if charList[0] != 0:
|
||||
rv.append([0, charList[0][0] - 1])
|
||||
for i, item in enumerate(charList[:-1]):
|
||||
rv.append([item[1]+1, charList[i+1][0] - 1])
|
||||
if charList[-1][1] != max_unicode:
|
||||
rv.append([charList[-1][1] + 1, max_unicode])
|
||||
return rv
|
||||
|
||||
def listToRegexpStr(charList):
|
||||
rv = []
|
||||
for item in charList:
|
||||
if item[0] == item[1]:
|
||||
rv.append(escapeRegexp(unichr(item[0])))
|
||||
else:
|
||||
rv.append(escapeRegexp(unichr(item[0])) + "-" +
|
||||
escapeRegexp(unichr(item[1])))
|
||||
return "[%s]"%"".join(rv)
|
||||
|
||||
def hexToInt(hex_str):
|
||||
return int(hex_str, 16)
|
||||
|
||||
def escapeRegexp(string):
|
||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||
"[", "]", "|", "(", ")", "-")
|
||||
for char in specialCharacters:
|
||||
string = string.replace(char, "\\" + char)
|
||||
if char in string:
|
||||
print string
|
||||
|
||||
return string
|
||||
|
||||
#output from the above
|
||||
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
def __init__(self, replaceChars = None,
|
||||
dropXmlnsLocalName = False,
|
||||
dropXmlnsAttrNs = False,
|
||||
preventDoubleDashComments = False,
|
||||
preventDashAtCommentEnd = False,
|
||||
replaceFormFeedCharacters = True):
|
||||
|
||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||
|
||||
self.preventDoubleDashComments = preventDoubleDashComments
|
||||
self.preventDashAtCommentEnd = preventDashAtCommentEnd
|
||||
|
||||
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
||||
|
||||
self.replaceCache = {}
|
||||
|
||||
def coerceAttribute(self, name, namespace=None):
|
||||
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
||||
#Need a datalosswarning here
|
||||
return None
|
||||
elif (self.dropXmlnsAttrNs and
|
||||
namespace == "http://www.w3.org/2000/xmlns/"):
|
||||
return None
|
||||
else:
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceElement(self, name, namespace=None):
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceComment(self, data):
|
||||
if self.preventDoubleDashComments:
|
||||
while "--" in data:
|
||||
data = data.replace("--", "- -")
|
||||
return data
|
||||
|
||||
def coerceCharacters(self, data):
|
||||
if self.replaceFormFeedCharacters:
|
||||
data = data.replace("\x0C", " ")
|
||||
#Other non-xml characters
|
||||
return data
|
||||
|
||||
def toXmlName(self, name):
|
||||
nameFirst = name[0]
|
||||
nameRest = name[1:]
|
||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||
if m:
|
||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||
else:
|
||||
nameFirstOutput = nameFirst
|
||||
|
||||
nameRestOutput = nameRest
|
||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||
for char in replaceChars:
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||
return nameFirstOutput + nameRestOutput
|
||||
|
||||
def getReplacementCharacter(self, char):
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
return replacement
|
||||
|
||||
def fromXmlName(self, name):
|
||||
for item in set(self.replacementRegexp.findall(name)):
|
||||
name = name.replace(item, self.unescapeChar(item))
|
||||
return name
|
||||
|
||||
def escapeChar(self, char):
|
||||
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
|
||||
self.replaceCache[char] = replacement
|
||||
return replacement
|
||||
|
||||
def unescapeChar(self, charcode):
|
||||
return unichr(int(charcode[1:], 16))
|
||||
782
html5lib/inputstream.py
Normal file
782
html5lib/inputstream.py
Normal file
@@ -0,0 +1,782 @@
|
||||
import codecs
|
||||
import re
|
||||
import types
|
||||
import sys
|
||||
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings, ReparseException
|
||||
import utils
|
||||
|
||||
#Non-unicode versions of constants for use in the pre-parser
# (the encoding prescan operates on raw bytes, before any decoding)
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])

# Matches single characters that are not legal in an HTML document:
# C0/C1 controls, surrogates, and the Unicode noncharacter codepoints.
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

# Non-BMP noncharacters, kept as integers for the narrow-build (UCS-2)
# error path, where they only appear as surrogate pairs.
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

# ASCII whitespace and punctuation, stripped when canonicalising an
# encoding name in codecName().
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}
|
||||
|
||||
class BufferedStream:
    """Buffering for streams that do not have buffering of their own.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2).

    Only the forward-reading subset of the file interface is provided:
    read(), tell(), and seek() (seek only within already-buffered data).
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []          # chunks already read from the stream
        self.position = [-1, 0]   # [chunk index, offset within that chunk]

    def tell(self):
        """Return the current position as an absolute offset."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Seek to absolute offset *pos*; must lie within buffered data."""
        assert pos < self._bufferedBytes()
        offset = pos
        i = 0
        # Walk chunk by chunk, consuming each chunk's length, until the
        # target offset falls inside chunk i.
        # (Bug fix: the original subtracted `pos` instead of the chunk
        # length, which landed at the wrong offset whenever the target
        # lay beyond the first chunk.)
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Read up to *bytes* characters, from the buffer when possible."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffer: read the stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Total number of characters currently held in the buffer."""
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read from the underlying stream, appending to the buffer."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Satisfy a read from buffered chunks, then the stream if needed."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request is satisfied inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and move on to the next one.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            # (The original bound list.append's None return to a dead
            # local; the assignment has been dropped.)
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            # Buffer exhausted: read the remainder straight from the stream.
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
|
||||
|
||||
|
||||
|
||||
class HTMLInputStream:
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.
    """

    # Number of characters to request per read from the decoded stream.
    _defaultChunkSize = 10240

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information
        """

        # Wide vs narrow Python build: on a narrow (UCS-2) build, a non-BMP
        # character is stored as a surrogate pair, so u"\U0010FFFF" has
        # length 2 and surrogate handling must be pair-aware.
        if len(u"\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            # On a UCS-4 build every surrogate codepoint is invalid.
            self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            # On a UCS-2 build only *unpaired* surrogates are invalid.
            self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

        # List of where new lines occur
        self.newLines = [0]

        # (codec name, confidence); codecName() yields None for unknown names.
        self.charEncoding = (codecName(encoding), "certain")

        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        # Encoding Information
        #Number of bytes to use when looking for a meta element with
        #encoding information
        self.numBytesMeta = 512
        #Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        #Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        #Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        self.reset()

    def reset(self):
        """(Re)create the decoding reader and clear all per-parse state."""
        # 'replace' substitutes U+FFFD for undecodable byte sequences.
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')

        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        #Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            # Otherwise treat source as a string and convert to a file object
            if isinstance(source, unicode):
                source = source.encode('utf-8')
                self.charEncoding = ("utf-8", "certain")
            try:
                from io import BytesIO
            except:
                # 2to3 converts this line to: from io import StringIO
                from cStringIO import StringIO as BytesIO
            stream = BytesIO(source)

        # Encoding detection reads ahead and rewinds, so wrap streams
        # that cannot seek (e.g. stdin) in a BufferedStream.
        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
            stream is sys.stdin):
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Return an (encoding name, confidence) guess for the raw stream."""
        #First look for a BOM
        #This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        #If there is no BOM need to look for meta elements with encoding
        #information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        #Guess with chardet, if avaliable
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                # chardet is optional; fall through to the default.
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence="tentative"
            encoding = self.defaultEncoding

        #Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1":"windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding mid-parse, re-decoding from the start.

        Raises ReparseException when the encoding actually changes, so the
        caller can restart parsing with the new decoder.
        """
        newEncoding = codecName(newEncoding)
        # A mid-stream declaration of UTF-16 cannot be right for bytes the
        # parser has been reading as an ASCII superset; use UTF-8 instead.
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.reset()
            self.charEncoding = (newEncoding, "certain")
            # NOTE(review): charEncoding was reassigned just above, so this
            # message reports the *new* encoding twice; also reset() ran
            # before the reassignment, so it built a reader for the old
            # encoding — verify both against upstream html5lib.
            raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2]) # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        # Prescan only the first numBytesMeta bytes, then rewind.
        buffer = self.rawStream.read(self.numBytesMeta)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # A meta element cannot truthfully declare UTF-16 for bytes the
        # prescan just read as ASCII-compatible; fall back to UTF-8.
        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding

    def _position(self, offset):
        """Return (lines, columns) for *offset* within the current chunk,
        counting from the start of the stream."""
        chunk = self.chunk
        nLines = chunk.count(u'\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind(u'\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line carried over from the previous chunk.
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        # Lines are reported 1-based; columns stay 0-based.
        return (line+1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Refill self.chunk; return False once the stream is exhausted."""
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Record line/column totals for the chunk we are about to discard.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        #Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # Hold back a trailing CR (might be half of CR LF) or trailing
            # high surrogate (might be half of a pair) for the next chunk.
            # NOTE(review): a 1-character chunk is never buffered here
            # (condition is > 1, not >= 1) — confirm that is intentional.
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub(u"\ufffd", data)

        # Normalise all newline conventions to a single LF.
        data = data.replace(u"\r\n", u"\n")
        data = data.replace(u"\r", u"\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record one parse error per invalid character (wide build)."""
        for i in xrange(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        """Record parse errors for invalid characters on a narrow build,
        where non-BMP characters appear as surrogate pairs."""
        #Someone picked the wrong compile option
        #You lose
        skip = False
        import sys
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # NOTE(review): `skip` is never reset on this path, so every
                # match after the first surrogate pair is skipped; it looks
                # like only the pair's low surrogate should be — verify
                # against upstream html5lib.
                continue
            codepoint = ord(match.group())
            pos = match.start()
            #Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos+2]):
                #We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the very end of the chunk.
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite = False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                # Pattern construction below only supports ASCII classes.
                for c in characters:
                    assert(ord(c) < 128)
            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = u"^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = u"".join(rv)
        return r

    def unget(self, char):
        """Push one character back onto the stream."""
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
|
||||
|
||||
class EncodingBytes(str):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        # Lowercase on construction so all matching is case-insensitive.
        return str.__new__(self, value.lower())

    def __init__(self, value):
        # -1 means "before the first byte"; next() advances to index 0.
        self._position=-1

    def __iter__(self):
        return self

    def next(self):
        """Advance the position and return the byte there
        (Python 2 iterator protocol)."""
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p]

    def previous(self):
        """Return the current byte, then step the position back by one."""
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p]

    def setPosition(self, position):
        # Touching the position once past the end keeps raising, so callers
        # inside EncodingParser bail out via StopIteration.
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            # Position has not been advanced yet.
            return None

    # Reading or assigning the position past the end raises StopIteration,
    # which EncodingParser uses as its "out of data" signal.
    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position               # use property for the error-checking
        while p < len(self):
            c = self[p]
            if c not in chars:
                # Stop on (and return) the first byte not in *chars*.
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        """Advance until a byte in *chars* is found; return it, or None at
        end of data."""
        p = self.position
        while p < len(self):
            c = self[p]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p+len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes)-1)
            return True
        else:
            # Not found: signal "out of data" to the caller.
            raise StopIteration
|
||||
|
||||
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        """Run the prescan over the data; return the encoding found or None."""
        # Dispatch table: the first matching prefix wins, so order matters —
        # "<!--" must be tried before "<!", and "</" before "<".
        methodDispatch = (
            ("<!--",self.handleComment),
            ("<meta",self.handleMeta),
            ("</",self.handlePossibleEndTag),
            ("<!",self.handleOther),
            ("<?",self.handleOther),
            ("<",self.handlePossibleStartTag))
        # The outer loop advances one byte at a time; matched handlers
        # advance the shared position further themselves.
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        # Handlers return False to stop the whole scan.
                        keepParsing = method()
                        break
                    except StopIteration:
                        # Ran off the end of the prescan buffer.
                        keepParsing=False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo("-->")

    def handleMeta(self):
        """Handle a <meta ...> element; may set self.encoding and, by
        returning False, stop the scan."""
        if self.data.currentByte not in spaceCharactersBytes:
            #if we have <meta not followed by a space so just keep going
            return True
        #We have a valid meta element we want to search for attributes
        while True:
            #Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == "charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == "content":
                    # content="text/html; charset=..." — delegate the
                    # charset extraction to ContentAttrParser.
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        # Step past the "/" so the tag-name check sees the first name byte.
        self.data.next()
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        """Skip a start/end tag, consuming (and discarding) its attributes."""
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            #If the next byte is not an ascii letter either ignore this
            #fragment (possible start tag case) or treat it according to
            #handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == "<":
            #return to the first step in the overall "two step" algorithm
            #reprocessing the < byte
            data.previous()
        else:
            #Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        """Skip a markup declaration or processing instruction
        by jumping past its '>'."""
        return self.data.jumpTo(">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        The numbered steps below follow the HTML "get an attribute"
        prescan algorithm.
        """
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset("/"))
        # Step 2
        if c in (">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        #Step 4 attribute name
        while True:
            if c == "=" and attrName:
                break
            elif c in spaceCharactersBytes:
                #Step 6!
                c = data.skip()
                c = data.next()
                break
            elif c in ("/", ">"):
                # Attribute with no value.
                return "".join(attrName), ""
            elif c in asciiUppercaseBytes:
                # Names are matched case-insensitively.
                attrName.append(c.lower())
            elif c == None:
                return None
            else:
                attrName.append(c)
            #Step 5
            c = data.next()
        #Step 7
        if c != "=":
            data.previous()
            return "".join(attrName), ""
        #Step 8
        data.next()
        #Step 9
        c = data.skip()
        #Step 10
        if c in ("'", '"'):
            #10.1
            quoteChar = c
            while True:
                #10.2
                c = data.next()
                #10.3
                if c == quoteChar:
                    data.next()
                    return "".join(attrName), "".join(attrValue)
                #10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                #10.5
                else:
                    attrValue.append(c)
        elif c == ">":
            return "".join(attrName), ""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11: unquoted value, terminated by whitespace or an angle
        # bracket.
        while True:
            c = data.next()
            if c in spacesAngleBrackets:
                return "".join(attrName), "".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
|
||||
|
||||
|
||||
class ContentAttrParser(object):
    """Extract a charset declaration from the value of a
    <meta content="..."> attribute."""

    def __init__(self, data):
        # data is expected to be an EncodingBytes instance.
        self.data = data

    def parse(self):
        """Return the declared encoding name, or None if there is none."""
        try:
            #Check if the attr name is charset
            #otherwise return
            self.data.jumpTo("charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == "=":
                #If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            #Look for an encoding between matching quote marks
            if self.data.currentByte in ('"', "'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    # NOTE(review): jumpTo raises StopIteration rather than
                    # returning False on failure, so this branch looks
                    # unreachable — verify against EncodingBytes.jumpTo.
                    return None
            else:
                #Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    #Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # Ran off the end of the attribute value; no charset found.
            return None
|
||||
|
||||
|
||||
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if encoding is None or type(encoding) not in types.StringTypes:
        return None
    # Strip ASCII whitespace/punctuation and lowercase before looking the
    # name up in the encodings table.
    canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(canonicalName, None)
|
||||
258
html5lib/sanitizer.py
Normal file
258
html5lib/sanitizer.py
Normal file
@@ -0,0 +1,258 @@
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
from tokenizer import HTMLTokenizer
|
||||
from constants import tokenTypes
|
||||
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
|
||||
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
|
||||
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
|
||||
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
|
||||
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
|
||||
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
|
||||
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
|
||||
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
|
||||
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
|
||||
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
|
||||
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
|
||||
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
|
||||
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
|
||||
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
|
||||
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
|
||||
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
|
||||
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
|
||||
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
|
||||
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
|
||||
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
|
||||
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
|
||||
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
|
||||
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
|
||||
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
|
||||
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
|
||||
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
|
||||
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
|
||||
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
|
||||
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
|
||||
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
|
||||
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
|
||||
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
|
||||
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
|
||||
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
|
||||
'width', 'wrap', 'xml:lang']
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
|
||||
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
|
||||
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
|
||||
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
|
||||
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
|
||||
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
|
||||
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
|
||||
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
|
||||
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
|
||||
'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
|
||||
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
|
||||
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
|
||||
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
|
||||
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
|
||||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||
'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||
'xlink:href', 'xml:base']
|
||||
|
||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||
'mask', 'stroke']
|
||||
|
||||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
|
||||
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
|
||||
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
|
||||
'set', 'use']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
|
||||
allowed_css_properties = acceptable_css_properties
|
||||
allowed_css_keywords = acceptable_css_keywords
|
||||
allowed_svg_properties = acceptable_svg_properties
|
||||
allowed_protocols = acceptable_protocols
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
    """Sanitize one tokenizer token.

    * Start/end/empty tags for elements in ``allowed_elements`` are
      returned with their attributes filtered (URI-valued attributes are
      checked against ``allowed_protocols``; style attributes go through
      ``sanitize_css``).
    * Tags for disallowed elements are rewritten as escaped "Characters"
      tokens so the markup shows up as literal text.
    * Comments are dropped (None is returned implicitly).
    * Every other token passes through unchanged.
    """
    # Accommodate filters which use token_type differently: accept either
    # the symbolic name or the numeric constant from tokenTypes.
    token_type = token["type"]
    if token_type in tokenTypes:
        token_type = tokenTypes[token_type]

    if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                      tokenTypes["EmptyTag"]):
        if token["name"] in self.allowed_elements:
            if "data" in token:
                self._sanitize_tag_attrs(token)
            return token
        else:
            return self._escape_disallowed_tag(token, token_type)
    elif token_type == tokenTypes["Comment"]:
        pass
    else:
        return token

def _sanitize_tag_attrs(self, token):
    """Filter token["data"] down to allowed, protocol-safe attributes."""
    # The reversal makes the FIRST occurrence of a duplicated attribute
    # win once the pairs are folded into a dict.
    attrs = dict((name, val) for name, val in token["data"][::-1]
                 if name in self.allowed_attributes)
    for attr in self.attr_val_is_uri:
        if attr not in attrs:
            continue
        # Strip backticks, control characters and whitespace that browsers
        # ignore when parsing URLs, so "java\tscript:"-style tricks are
        # caught by the protocol check below.
        val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                               unescape(attrs[attr])).lower()
        # Remove replacement characters from unescaped characters.
        val_unescaped = val_unescaped.replace(u"\ufffd", "")
        if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                (val_unescaped.split(':')[0] not in
                 self.allowed_protocols)):
            del attrs[attr]
    for attr in self.svg_attr_val_allows_ref:
        if attr in attrs:
            # Drop non-local url(...) references from SVG attributes.
            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                 ' ',
                                 unescape(attrs[attr]))
    if (token["name"] in self.svg_allow_local_href and
            'xlink:href' in attrs and re.search(r'^\s*[^#\s].*',
                                                attrs['xlink:href'])):
        # Only fragment-local (same-document) xlink:href values survive.
        del attrs['xlink:href']
    if 'style' in attrs:
        attrs['style'] = self.sanitize_css(attrs['style'])
    token["data"] = [[name, val] for name, val in attrs.items()]

def _escape_disallowed_tag(self, token, token_type):
    """Turn a disallowed tag token into an escaped Characters token."""
    if token_type == tokenTypes["EndTag"]:
        token["data"] = "</%s>" % token["name"]
    elif token["data"]:
        attrs = ''.join([' %s="%s"' % (k, escape(v))
                         for k, v in token["data"]])
        token["data"] = "<%s%s>" % (token["name"], attrs)
    else:
        token["data"] = "<%s>" % token["name"]
    if token.get("selfClosing"):
        token["data"] = token["data"][:-1] + "/>"

    # Mirror the caller's convention: if the incoming type was symbolic,
    # stay symbolic; otherwise use the numeric constant.
    if token["type"] in tokenTypes:
        token["type"] = "Characters"
    else:
        token["type"] = tokenTypes["Characters"]

    del token["name"]
    return token
|
||||
|
||||
def sanitize_css(self, style):
    """Return *style* reduced to the allowed CSS properties/keywords.

    Any url(...) values are removed, then the whole declaration list must
    pass two whitelisting regexes ("the gauntlet") or the empty string is
    returned.  Surviving declarations are kept only for properties in
    ``allowed_css_properties`` / ``allowed_svg_properties``, or for the
    background/border/margin/padding shorthands whose every value token is
    an allowed keyword or a simple color/length literal.
    """
    # Disallow urls outright.
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

    # Gauntlet: reject anything outside a conservative character set.
    if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
        return ''
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
        if not value:
            continue
        if prop.lower() in self.allowed_css_properties:
            clean.append(prop + ': ' + value + ';')
        elif prop.split('-')[0].lower() in ['background', 'border',
                                            'margin', 'padding']:
            for keyword in value.split():
                # Consult allowed_css_keywords (not acceptable_css_keywords)
                # so subclasses overriding the allowed_* constant — the
                # documented extension point — actually take effect.
                if keyword not in self.allowed_css_keywords and \
                    not re.match(r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                    break
            else:
                clean.append(prop + ': ' + value + ';')
        elif prop.lower() in self.allowed_svg_properties:
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that sanitizes every token it emits."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False,
                 parser=None):
        # Case-matching defaults are flipped from the tokenizer's usual
        # ones because we only ever output lowercase html anyway.
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               parser=parser)

    def __iter__(self):
        # Run each raw token through the sanitizer; tokens it suppresses
        # come back falsy and are simply not yielded.
        for raw_token in HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if cleaned:
                yield cleaned
|
||||
17
html5lib/serializer/__init__.py
Normal file
17
html5lib/serializer/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
from html5lib import treewalkers
|
||||
|
||||
from htmlserializer import HTMLSerializer
|
||||
from xhtmlserializer import XHTMLSerializer
|
||||
|
||||
def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    """Serialize the parse tree *input* to a string.

    *tree* names the treewalker to use, *format* selects HTML or XHTML
    output, *encoding* (optional) yields bytes instead of unicode, and any
    extra keyword options are forwarded to the serializer.

    Raises ValueError for an unknown *format*.
    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
        # Call form instead of the Python-2-only "raise E, msg" statement.
        raise ValueError("type must be either html or xhtml")
    return s.render(walker(input), encoding)
|
||||
312
html5lib/serializer/htmlserializer.py
Normal file
312
html5lib/serializer/htmlserializer.py
Normal file
@@ -0,0 +1,312 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements, entities, xmlEntities
|
||||
from html5lib import utils
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
try:
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
except ImportError:
|
||||
unicode_encode_errors = "strict"
|
||||
else:
|
||||
unicode_encode_errors = "htmlentityreplace"
|
||||
|
||||
from html5lib.constants import entities
|
||||
|
||||
# Map each single-codepoint entity's codepoint to its preferred name.
encode_entity_map = {}
is_ucs4 = len(u"\U0010FFFF") == 1
for k, v in entities.items():
    # Skip multi-character entities: they have no single codepoint.
    if ((is_ucs4 and len(v) > 1) or
        (not is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            # Narrow (UCS-2) build: a surrogate pair encodes one
            # astral codepoint.
            v = utils.surrogatePairToCodepoint(v)
        else:
            try:
                v = ord(v)
            except Exception:
                # Show the offending value before re-raising (print()
                # form is valid in both Python 2 and 3).
                print(v)
                raise
        if v not in encode_entity_map or k.islower():
            # Prefer the lowercase entity name when several names map to
            # the same codepoint.
            encode_entity_map[v] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
    """Codec error handler replacing unencodable characters with HTML
    entities — named where one exists in encode_entity_map, numeric
    hexadecimal references otherwise.  Registered below under the name
    held in ``unicode_encode_errors``.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        # First pass: collect codepoints, joining surrogate pairs into a
        # single codepoint on narrow (UCS-2) builds.
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                # This unit was the second half of a pair consumed on the
                # previous iteration.
                skip = False
                continue
            index = i + exc.start
            if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
                codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        # Second pass: emit a named entity when available, otherwise a
        # numeric character reference.
        for cp in codepoints:
            e = encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;"%(hex(cp)[2:]))
        return (u"".join(res), exc.end)
    else:
        # Not an encode/translate error: defer to the stdlib
        # xmlcharrefreplace handler.
        return xmlcharrefreplace_errors(exc)
|
||||
|
||||
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||
|
||||
del register_error
|
||||
|
||||
|
||||
class HTMLSerializer(object):
    """Serializes a treewalker token stream back into (X)HTML text.

    Behaviour is controlled by the class-level option attributes below,
    each of which can be overridden per-instance via keyword arguments to
    ``__init__`` (see its docstring).
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = u'"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names of the recognized keyword options, in no particular order.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "minimize_boolean_attributes", "use_trailing_solidus",
               "space_before_trailing_solidus", "omit_optional_tags",
               "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
               "escape_rcdata", "resolve_entities", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether it insert a meta element to define the character set of the
          document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities < > & " '
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote_char disables the automatic best-quote logic.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode *string*, replacing unencodable chars with entities."""
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string

    def encodeStrict(self, string):
        """Encode *string*, raising on unencodable characters."""
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Yield encoded output chunks for the token stream *treewalker*."""
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = u"<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += u' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += u" SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find(u'"') >= 0:
                        if token["systemId"].find(u"'") >= 0:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += u">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict(u"<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in sorted(token["data"].items()):
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(u' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) \
                         and k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict(u"=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quote only when the value contains whitespace
                            # or a character that would end/confuse an
                            # unquoted attribute value.
                            quote_attr = any(
                                c in v for c in spaceCharacters + u">\"'=")
                        v = v.replace(u"&", u"&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace(u"<", u"&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if u"'" in v and u'"' not in v:
                                    quote_char = u'"'
                                elif u'"' in v and u"'" not in v:
                                    quote_char = u"'"
                            if quote_char == u"'":
                                v = v.replace(u"'", u"&#39;")
                            else:
                                v = v.replace(u'"', u"&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(u" /")
                    else:
                        yield self.encodeStrict(u"/")
                yield self.encode(u">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict(u"</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict(u"<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError(_("Entity %s not recognized" % name))
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize the whole stream to one byte or unicode string."""
        if encoding:
            return "".join(list(self.serialize(treewalker, encoding)))
        else:
            return u"".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization error; raise when self.strict is set."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
||||
|
||||
class SerializeError(Exception):
    """Error in serialized tree.

    Originally declared with ``def``, which made SerializeError a plain
    function — ``raise SerializeError`` in ``HTMLSerializer.serializeError``
    would then fail with a TypeError.  It must be an Exception subclass.
    """
    pass
|
||||
9
html5lib/serializer/xhtmlserializer.py
Normal file
9
html5lib/serializer/xhtmlserializer.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from htmlserializer import HTMLSerializer
|
||||
|
||||
class XHTMLSerializer(HTMLSerializer):
    """HTMLSerializer configured with stricter, XML-compatible defaults."""
    quote_attr_values = True  # XML requires every attribute value quoted
    minimize_boolean_attributes = False  # emit checked="checked", never bare
    use_trailing_solidus = True  # self-close void elements: <hr/>
    escape_lt_in_attrs = True  # escape "<" inside attribute values too
    omit_optional_tags = False  # XML has no optional start/end tags
    escape_rcdata = True  # escape inside rcdata elements such as style
||||
12
html5lib/tests/__init__.py
Normal file
12
html5lib/tests/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
parent_path = os.path.abspath(os.path.join(os.path.split(__file__)[0], ".."))
|
||||
|
||||
if not parent_path in sys.path:
|
||||
sys.path.insert(0, parent_path)
|
||||
del parent_path
|
||||
|
||||
from runtests import buildTestSuite
|
||||
|
||||
import support
|
||||
37
html5lib/tests/mockParser.py
Normal file
37
html5lib/tests/mockParser.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
if __name__ == '__main__':
|
||||
#Allow us to import from the src directory
|
||||
os.chdir(os.path.split(os.path.abspath(__file__))[0])
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
|
||||
|
||||
from tokenizer import HTMLTokenizer
|
||||
|
||||
class HTMLParser(object):
    """Fake parser to test tokenizer output."""

    def parse(self, stream, output=True):
        """Tokenize *stream*, printing each token when *output* is true."""
        tokenizer = HTMLTokenizer(stream)
        for token in tokenizer:
            if output:
                # print(x) with a single argument behaves identically under
                # Python 2's print statement and is valid Python 3.
                print(token)
|
||||
|
||||
if __name__ == "__main__":
|
||||
x = HTMLParser()
|
||||
if len(sys.argv) > 1:
|
||||
if len(sys.argv) > 2:
|
||||
import hotshot, hotshot.stats
|
||||
prof = hotshot.Profile('stats.prof')
|
||||
prof.runcall(x.parse, sys.argv[1], False)
|
||||
prof.close()
|
||||
stats = hotshot.stats.load('stats.prof')
|
||||
stats.strip_dirs()
|
||||
stats.sort_stats('time')
|
||||
stats.print_stats()
|
||||
else:
|
||||
x.parse(sys.argv[1])
|
||||
else:
|
||||
print """Usage: python mockParser.py filename [stats]
|
||||
If stats is specified the hotshots profiler will run and output the
|
||||
stats instead.
|
||||
"""
|
||||
27
html5lib/tests/runparsertests.py
Normal file
27
html5lib/tests/runparsertests.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
import unittest
|
||||
|
||||
#Allow us to import the parent module
|
||||
os.chdir(os.path.split(os.path.abspath(__file__))[0])
|
||||
sys.path.insert(0, os.path.abspath(os.curdir))
|
||||
sys.path.insert(0, os.path.abspath(os.pardir))
|
||||
sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
|
||||
|
||||
def buildTestSuite():
    """Collect the tokenizer/parser suites from this directory."""
    wanted = ("test_tokenizer.py", "test_parser.py", "test_parser2.py")
    suite = unittest.TestSuite()
    for filename in glob.glob('test_*.py'):
        if filename not in wanted:
            continue
        module_name = os.path.splitext(filename)[0]
        suite.addTest(__import__(module_name).buildTestSuite())
    return suite
|
||||
|
||||
def main():
|
||||
results = unittest.TextTestRunner().run(buildTestSuite())
|
||||
return results
|
||||
|
||||
if __name__ == "__main__":
|
||||
results = main()
|
||||
if not results.wasSuccessful():
|
||||
sys.exit(1)
|
||||
20
html5lib/tests/runtests.py
Normal file
20
html5lib/tests/runtests.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
import unittest
|
||||
|
||||
def buildTestSuite():
    """Aggregate the suites from every test_*.py module in this directory."""
    suite = unittest.TestSuite()
    for path in glob.glob('test_*.py'):
        module_name = os.path.splitext(path)[0]
        module = __import__(module_name)
        suite.addTest(module.buildTestSuite())
    return suite
|
||||
|
||||
def main():
|
||||
results = unittest.TextTestRunner().run(buildTestSuite())
|
||||
return results
|
||||
|
||||
if __name__ == "__main__":
|
||||
results = main()
|
||||
if not results.wasSuccessful():
|
||||
sys.exit(1)
|
||||
127
html5lib/tests/support.py
Normal file
127
html5lib/tests/support.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
import glob
|
||||
|
||||
base_path = os.path.split(__file__)[0]
|
||||
|
||||
if os.path.exists(os.path.join(base_path, 'testdata')):
|
||||
#release
|
||||
test_dir = os.path.join(base_path, 'testdata')
|
||||
else:
|
||||
#development
|
||||
test_dir = os.path.abspath(
|
||||
os.path.join(base_path,
|
||||
os.path.pardir, os.path.pardir,
|
||||
os.path.pardir, 'testdata'))
|
||||
assert os.path.exists(test_dir), "Test data not found"
|
||||
#import the development html5lib
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(base_path,
|
||||
os.path.pardir,
|
||||
os.path.pardir)))
|
||||
|
||||
import html5lib
|
||||
from html5lib import html5parser, treebuilders
|
||||
del base_path
|
||||
|
||||
#Build a dict of avaliable trees
|
||||
treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
|
||||
"DOM":treebuilders.getTreeBuilder("dom")}
|
||||
|
||||
#Try whatever etree implementations are avaliable from a list that are
|
||||
#"supposed" to work
|
||||
try:
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
try:
|
||||
import elementtree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import xml.etree.cElementTree as cElementTree
|
||||
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
try:
|
||||
import cElementTree
|
||||
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import lxml.etree as lxml
|
||||
treeTypes['lxml'] = treebuilders.getTreeBuilder("etree", lxml, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import BeautifulSoup
|
||||
treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder("beautifulsoup", fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
def html5lib_test_files(subdirectory, files='*.dat'):
    """Glob the test-data files under *subdirectory* of the test_dir tree."""
    pattern = os.path.join(test_dir, subdirectory, files)
    return glob.glob(pattern)
|
||||
|
||||
class DefaultDict(dict):
    """Dict returning a fixed fallback value for missing keys.

    Unlike ``collections.defaultdict``, a missing lookup does NOT insert
    the key, and the same default object is shared by every lookup.
    """

    def __init__(self, default, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.default = default

    def __getitem__(self, key):
        # Fall back to the shared default instead of raising KeyError.
        if dict.__contains__(self, key):
            return dict.__getitem__(self, key)
        return self.default
|
||||
|
||||
class TestData(object):
    """Iterator over the tests in one html5lib ``.dat`` test file.

    The file is a sequence of sections introduced by ``#heading`` lines;
    a new test begins at each *newTestHeading* section.  Iterating yields
    one dict per test mapping section heading -> section body text.
    """

    def __init__(self, filename, newTestHeading="data"):
        # NOTE(review): the file handle is never closed explicitly; it
        # lives for the lifetime of this object.
        self.f = codecs.open(filename, encoding="utf8")
        self.newTestHeading = newTestHeading

    def __iter__(self):
        data = DefaultDict(None)
        key=None
        for line in self.f:
            heading = self.isSectionHeading(line)
            if heading:
                if data and heading == self.newTestHeading:
                    #Remove trailing newline
                    data[key] = data[key][:-1]
                    yield self.normaliseOutput(data)
                    data = DefaultDict(None)
                key = heading
                data[key]=""
            elif key is not None:
                data[key] += line
        # Emit the final test (it is not followed by another heading).
        if data:
            yield self.normaliseOutput(data)

    def isSectionHeading(self, line):
        """If the current heading is a test section heading return the heading,
        otherwise return False"""
        if line.startswith("#"):
            return line[1:].strip()
        else:
            return False

    def normaliseOutput(self, data):
        #Remove trailing newlines
        # NOTE(review): iteritems is Python 2 only.
        for key,value in data.iteritems():
            if value.endswith("\n"):
                data[key] = value[:-1]
        return data
|
||||
|
||||
def convert(stripChars):
    """Build a converter stripping *stripChars* leading characters from
    every tree-dump line (lines beginning with "|")."""
    def convertData(data):
        """convert the output of str(document) to the format used in the testcases"""
        converted = [
            line[stripChars:] if line.startswith("|") else line
            for line in data.split("\n")
        ]
        return "\n".join(converted)
    return convertData
|
||||
|
||||
convertExpected = convert(2)
|
||||
54
html5lib/tests/test_encoding.py
Normal file
54
html5lib/tests/test_encoding.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import os
|
||||
import unittest
|
||||
from support import html5lib_test_files, TestData, test_dir
|
||||
|
||||
from html5lib import HTMLParser, inputstream
|
||||
|
||||
import re, unittest
|
||||
|
||||
class Html5EncodingTestCase(unittest.TestCase):
|
||||
def test_codec_name(self):
|
||||
self.assertEquals(inputstream.codecName("utf-8"), "utf-8")
|
||||
self.assertEquals(inputstream.codecName("utf8"), "utf-8")
|
||||
self.assertEquals(inputstream.codecName(" utf8 "), "utf-8")
|
||||
self.assertEquals(inputstream.codecName("ISO_8859--1"), "windows-1252")
|
||||
|
||||
def buildTestSuite():
|
||||
for filename in html5lib_test_files("encoding"):
|
||||
test_name = os.path.basename(filename).replace('.dat',''). \
|
||||
replace('-','')
|
||||
tests = TestData(filename, "data")
|
||||
for idx, test in enumerate(tests):
|
||||
def encodingTest(self, data=test['data'],
|
||||
encoding=test['encoding']):
|
||||
p = HTMLParser()
|
||||
t = p.parse(data, useChardet=False)
|
||||
|
||||
errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
|
||||
(data, repr(encoding.lower()),
|
||||
repr(p.tokenizer.stream.charEncoding)))
|
||||
self.assertEquals(encoding.lower(),
|
||||
p.tokenizer.stream.charEncoding[0],
|
||||
errorMessage)
|
||||
setattr(Html5EncodingTestCase, 'test_%s_%d' % (test_name, idx+1),
|
||||
encodingTest)
|
||||
|
||||
try:
|
||||
import chardet
|
||||
def test_chardet(self):
|
||||
data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read()
|
||||
encoding = inputstream.HTMLInputStream(data).charEncoding
|
||||
assert encoding[0].lower() == "big5"
|
||||
setattr(Html5EncodingTestCase, 'test_chardet', test_chardet)
|
||||
except ImportError:
|
||||
print "chardet not found, skipping chardet tests"
|
||||
|
||||
|
||||
return unittest.defaultTestLoader.loadTestsFromName(__name__)
|
||||
|
||||
def main():
|
||||
buildTestSuite()
|
||||
unittest.main()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
296
html5lib/tests/test_formfiller.py
Normal file
296
html5lib/tests/test_formfiller.py
Normal file
@@ -0,0 +1,296 @@
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from html5lib.filters.formfiller import SimpleFilter
|
||||
|
||||
class FieldStorage(dict):
    """Minimal stand-in for cgi.FieldStorage used by the form-filler tests."""

    def getlist(self, name):
        """Return the value(s) stored under *name*, always as a list."""
        value = self[name]
        if isinstance(value, list):
            return value
        if isinstance(value, tuple) or hasattr(value, '__iter__'):
            return list(value)
        return [value]
|
||||
|
||||
class TestCase(unittest.TestCase):
|
||||
def runTest(self, input, formdata, expected):
|
||||
try:
|
||||
output = list(SimpleFilter(input, formdata))
|
||||
except NotImplementedError, nie:
|
||||
# Amnesty for those that confess...
|
||||
print >>sys.stderr, "Not implemented:", str(nie)
|
||||
else:
|
||||
errorMsg = "\n".join(["\n\nInput:", str(input),
|
||||
"\nForm data:", str(formdata),
|
||||
"\nExpected:", str(expected),
|
||||
"\nReceived:", str(output)])
|
||||
self.assertEquals(output, expected, errorMsg)
|
||||
|
||||
def testSingleTextInputWithValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"quux")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"bar")]}])
|
||||
|
||||
def testSingleTextInputWithoutValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"text"), (u"name", u"foo")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"bar")]}])
|
||||
|
||||
def testSingleCheckbox(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}])
|
||||
|
||||
def testSingleCheckboxShouldBeUnchecked(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}])
|
||||
|
||||
def testSingleCheckboxCheckedByDefault(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}])
|
||||
|
||||
def testSingleCheckboxCheckedByDefaultShouldBeUnchecked(self):
|
||||
self.runTest(
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux"), (u"checked", u"")]}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"EmptyTag", "name": u"input",
|
||||
"data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}])
|
||||
|
||||
def testSingleTextareaWithValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"textarea", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"textarea", "data": []}])
|
||||
|
||||
def testSingleTextareaWithoutValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
|
||||
{"type": u"EndTag", "name": u"textarea", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"textarea", "data": []}])
|
||||
|
||||
def testSingleSelectWithValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectWithValueShouldBeUnselected(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "quux"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectWithoutValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectWithoutValueShouldBeUnselected(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "quux"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectTwoOptionsWithValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectTwoOptionsWithValueShouldBeUnselected(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"baz")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "quux"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"baz")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectTwoOptionsWithoutValue(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "bar"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectTwoOptionsWithoutValueShouldBeUnselected(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"baz"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": "quux"}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"bar"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": []},
|
||||
{"type": u"Characters", "data": u"baz"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testSingleSelectMultiple(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo"), (u"multiple", u"")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": ["bar", "quux"]}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo"), (u"multiple", u"")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def testTwoSelect(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []},
|
||||
{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}],
|
||||
FieldStorage({"foo": ["bar", "quux"]}),
|
||||
[{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []},
|
||||
{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux"), (u"selected", u"")]},
|
||||
{"type": u"Characters", "data": u"quux"},
|
||||
{"type": u"EndTag", "name": u"option", "data": []},
|
||||
{"type": u"EndTag", "name": u"select", "data": []}])
|
||||
|
||||
def buildTestSuite():
|
||||
return unittest.defaultTestLoader.loadTestsFromName(__name__)
|
||||
|
||||
def main():
|
||||
buildTestSuite()
|
||||
unittest.main()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
140
html5lib/tests/test_parser.py
Normal file
140
html5lib/tests/test_parser.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
import StringIO
|
||||
import warnings
|
||||
import re
|
||||
|
||||
warnings.simplefilter("error")
|
||||
|
||||
from support import html5lib_test_files as data_files
|
||||
from support import TestData, convert, convertExpected
|
||||
import html5lib
|
||||
from html5lib import html5parser, treebuilders, constants
|
||||
|
||||
treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
|
||||
"DOM":treebuilders.getTreeBuilder("dom")}
|
||||
|
||||
#Try whatever etree implementations are avaliable from a list that are
|
||||
#"supposed" to work
|
||||
try:
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
try:
|
||||
import elementtree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import xml.etree.cElementTree as cElementTree
|
||||
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
try:
|
||||
import cElementTree
|
||||
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
try:
|
||||
import lxml.html as lxml
|
||||
except ImportError:
|
||||
import lxml.etree as lxml
|
||||
treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml", lxml, fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import BeautifulSoup
|
||||
treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder("beautifulsoup", fullTree=True)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
#Try whatever dom implementations are avaliable from a list that are
|
||||
#"supposed" to work
|
||||
try:
|
||||
import pxdom
|
||||
treeTypes["pxdom"] = treebuilders.getTreeBuilder("dom", pxdom)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
#Run the parse error checks
|
||||
checkParseErrors = False
|
||||
|
||||
#XXX - There should just be one function here but for some reason the testcase
|
||||
#format differs from the treedump format by a single space character
|
||||
def convertTreeDump(data):
|
||||
return "\n".join(convert(3)(data).split("\n")[1:])
|
||||
|
||||
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
|
||||
|
||||
|
||||
def runParserTest(innerHTML, input, expected, errors, treeClass,
|
||||
namespaceHTMLElements):
|
||||
#XXX - move this out into the setup function
|
||||
#concatenate all consecutive character tokens into a single token
|
||||
try:
|
||||
p = html5parser.HTMLParser(tree = treeClass,
|
||||
namespaceHTMLElements=namespaceHTMLElements)
|
||||
except constants.DataLossWarning:
|
||||
return
|
||||
|
||||
try:
|
||||
if innerHTML:
|
||||
document = p.parseFragment(input, innerHTML)
|
||||
else:
|
||||
try:
|
||||
document = p.parse(input)
|
||||
except constants.DataLossWarning:
|
||||
return
|
||||
except:
|
||||
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
|
||||
u"\nTraceback:", traceback.format_exc()])
|
||||
assert False, errorMsg.encode("utf8")
|
||||
|
||||
output = convertTreeDump(p.tree.testSerializer(document))
|
||||
|
||||
expected = convertExpected(expected)
|
||||
if namespaceHTMLElements:
|
||||
expected = namespaceExpected(r"\1<html \2>", expected)
|
||||
|
||||
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
|
||||
u"\nReceived:", output])
|
||||
assert expected == output, errorMsg.encode("utf8")
|
||||
errStr = [u"Line: %i Col: %i %s"%(line, col,
|
||||
constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
|
||||
((line,col), errorcode, datavars) in p.errors]
|
||||
|
||||
errorMsg2 = u"\n".join([u"\n\nInput:", input,
|
||||
u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
|
||||
u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
|
||||
if checkParseErrors:
|
||||
assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
|
||||
|
||||
def test_parser():
|
||||
sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
|
||||
files = data_files('tree-construction')
|
||||
|
||||
for filename in files:
|
||||
testName = os.path.basename(filename).replace(".dat","")
|
||||
|
||||
tests = TestData(filename, "data")
|
||||
|
||||
for index, test in enumerate(tests):
|
||||
input, errors, innerHTML, expected = [test[key] for key in
|
||||
'data', 'errors',
|
||||
'document-fragment',
|
||||
'document']
|
||||
if errors:
|
||||
errors = errors.split("\n")
|
||||
|
||||
for treeName, treeCls in treeTypes.iteritems():
|
||||
for namespaceHTMLElements in (True, False):
|
||||
print input
|
||||
yield (runParserTest, innerHTML, input, expected, errors, treeCls,
|
||||
namespaceHTMLElements)
|
||||
break
|
||||
|
||||
|
||||
39
html5lib/tests/test_parser2.py
Executable file
39
html5lib/tests/test_parser2.py
Executable file
@@ -0,0 +1,39 @@
|
||||
import support
|
||||
from html5lib import html5parser
|
||||
from html5lib.constants import namespaces
|
||||
from html5lib.treebuilders import dom
|
||||
|
||||
import unittest
|
||||
|
||||
# tests that aren't autogenerated from text files
|
||||
class MoreParserTests(unittest.TestCase):
|
||||
|
||||
def test_assertDoctypeCloneable(self):
|
||||
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
|
||||
doc = parser.parse('<!DOCTYPE HTML>')
|
||||
self.assert_(doc.cloneNode(True))
|
||||
|
||||
def test_line_counter(self):
|
||||
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
|
||||
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
|
||||
parser.parse("<pre>\nx\n>\n</pre>")
|
||||
|
||||
def test_namespace_html_elements_0(self):
|
||||
parser = html5parser.HTMLParser(namespaceHTMLElements=True)
|
||||
doc = parser.parse("<html></html>")
|
||||
self.assert_(doc.childNodes[0].namespace == namespaces["html"])
|
||||
|
||||
def test_namespace_html_elements_1(self):
|
||||
parser = html5parser.HTMLParser(namespaceHTMLElements=False)
|
||||
doc = parser.parse("<html></html>")
|
||||
self.assert_(doc.childNodes[0].namespace == None)
|
||||
|
||||
def buildTestSuite():
|
||||
return unittest.defaultTestLoader.loadTestsFromName(__name__)
|
||||
|
||||
def main():
|
||||
buildTestSuite()
|
||||
unittest.main()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
76
html5lib/tests/test_sanitizer.py
Normal file
76
html5lib/tests/test_sanitizer.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
import simplejson as json
|
||||
|
||||
from html5lib import html5parser, sanitizer, constants
|
||||
|
||||
def runSanitizerTest(name, expected, input):
|
||||
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
|
||||
parseFragment(expected).childNodes])
|
||||
expected = json.loads(json.dumps(expected))
|
||||
assert expected == sanitize_html(input)
|
||||
|
||||
def sanitize_html(stream):
|
||||
return ''.join([token.toxml() for token in
|
||||
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
|
||||
parseFragment(stream).childNodes])
|
||||
|
||||
def test_should_handle_astral_plane_characters():
|
||||
assert u"<p>\U0001d4b5 \U0001d538</p>" == sanitize_html("<p>𝒵 𝔸</p>")
|
||||
|
||||
def test_sanitizer():
|
||||
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
|
||||
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
|
||||
continue ### TODO
|
||||
if tag_name != tag_name.lower():
|
||||
continue ### TODO
|
||||
if tag_name == 'image':
|
||||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
|
||||
"<img title=\"1\"/>foo <bad>bar</bad> baz",
|
||||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
|
||||
elif tag_name == 'br':
|
||||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
|
||||
"<br title=\"1\"/>foo <bad>bar</bad> baz<br/>",
|
||||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
|
||||
elif tag_name in constants.voidElements:
|
||||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
|
||||
"<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name,
|
||||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
|
||||
else:
|
||||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
|
||||
"<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name),
|
||||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
|
||||
|
||||
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
|
||||
tag_name = tag_name.upper()
|
||||
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
|
||||
"<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name),
|
||||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
|
||||
|
||||
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
|
||||
if attribute_name != attribute_name.lower(): continue ### TODO
|
||||
if attribute_name == 'style': continue
|
||||
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
|
||||
"<p %s=\"foo\">foo <bad>bar</bad> baz</p>" % attribute_name,
|
||||
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
|
||||
|
||||
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
|
||||
attribute_name = attribute_name.upper()
|
||||
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
|
||||
"<p>foo <bad>bar</bad> baz</p>",
|
||||
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
|
||||
|
||||
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
|
||||
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
|
||||
"<a href=\"%s\">foo</a>" % protocol,
|
||||
"""<a href="%s">foo</a>""" % protocol)
|
||||
|
||||
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
|
||||
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
|
||||
"<a href=\"%s\">foo</a>" % protocol,
|
||||
"""<a href="%s">foo</a>""" % protocol)
|
||||
180
html5lib/tests/test_serializer.py
Normal file
180
html5lib/tests/test_serializer.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import os
|
||||
import unittest
|
||||
from support import html5lib_test_files
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
import simplejson as json
|
||||
|
||||
import html5lib
|
||||
from html5lib import html5parser, serializer, constants
|
||||
from html5lib.treewalkers._base import TreeWalker
|
||||
|
||||
optionals_loaded = []
|
||||
|
||||
try:
|
||||
from lxml import etree
|
||||
optionals_loaded.append("lxml")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
default_namespace = constants.namespaces["html"]
|
||||
|
||||
class JsonWalker(TreeWalker):
|
||||
def __iter__(self):
|
||||
for token in self.tree:
|
||||
type = token[0]
|
||||
if type == "StartTag":
|
||||
if len(token) == 4:
|
||||
namespace, name, attrib = token[1:4]
|
||||
else:
|
||||
namespace = default_namespace
|
||||
name, attrib = token[1:3]
|
||||
yield self.startTag(namespace, name, self._convertAttrib(attrib))
|
||||
elif type == "EndTag":
|
||||
if len(token) == 3:
|
||||
namespace, name = token[1:3]
|
||||
else:
|
||||
namespace = default_namespace
|
||||
name = token[1]
|
||||
yield self.endTag(namespace, name)
|
||||
elif type == "EmptyTag":
|
||||
if len(token) == 4:
|
||||
namespace, name, attrib = token[1:]
|
||||
else:
|
||||
namespace = default_namespace
|
||||
name, attrib = token[1:]
|
||||
for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
|
||||
yield token
|
||||
elif type == "Comment":
|
||||
yield self.comment(token[1])
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
for token in self.text(token[1]):
|
||||
yield token
|
||||
elif type == "Doctype":
|
||||
if len(token) == 4:
|
||||
yield self.doctype(token[1], token[2], token[3])
|
||||
elif len(token) == 3:
|
||||
yield self.doctype(token[1], token[2])
|
||||
else:
|
||||
yield self.doctype(token[1])
|
||||
else:
|
||||
raise ValueError("Unknown token type: " + type)
|
||||
|
||||
def _convertAttrib(self, attribs):
|
||||
"""html5lib tree-walkers use a dict of (namespace, name): value for
|
||||
attributes, but JSON cannot represent this. Convert from the format
|
||||
in the serializer tests (a list of dicts with "namespace", "name",
|
||||
and "value" as keys) to html5lib's tree-walker format."""
|
||||
attrs = {}
|
||||
for attrib in attribs:
|
||||
name = (attrib["namespace"], attrib["name"])
|
||||
assert(name not in attrs)
|
||||
attrs[name] = attrib["value"]
|
||||
return attrs
|
||||
|
||||
|
||||
def serialize_html(input, options):
|
||||
options = dict([(str(k),v) for k,v in options.iteritems()])
|
||||
return serializer.HTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
|
||||
|
||||
def serialize_xhtml(input, options):
|
||||
options = dict([(str(k),v) for k,v in options.iteritems()])
|
||||
return serializer.XHTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
|
||||
|
||||
def make_test(input, expected, xhtml, options):
|
||||
result = serialize_html(input, options)
|
||||
if len(expected) == 1:
|
||||
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], result, str(options))
|
||||
elif result not in expected:
|
||||
assert False, "Expected: %s, Received: %s" % (expected, result)
|
||||
|
||||
if not xhtml:
|
||||
return
|
||||
|
||||
result = serialize_xhtml(input, options)
|
||||
if len(xhtml) == 1:
|
||||
assert xhtml[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:True\n%s"%(xhtml[0], result, str(options))
|
||||
elif result not in xhtml:
|
||||
assert False, "Expected: %s, Received: %s" % (xhtml, result)
|
||||
|
||||
|
||||
class EncodingTestCase(unittest.TestCase):
|
||||
def throwsWithLatin1(self, input):
|
||||
self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})
|
||||
|
||||
def testDoctypeName(self):
|
||||
self.throwsWithLatin1([["Doctype", u"\u0101"]])
|
||||
|
||||
def testDoctypePublicId(self):
|
||||
self.throwsWithLatin1([["Doctype", u"potato", u"\u0101"]])
|
||||
|
||||
def testDoctypeSystemId(self):
|
||||
self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])
|
||||
|
||||
def testCdataCharacters(self):
|
||||
self.assertEquals("<style>ā", serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
|
||||
["Characters", u"\u0101"]],
|
||||
{"encoding": "iso-8859-1"}))
|
||||
|
||||
def testCharacters(self):
|
||||
self.assertEquals("ā", serialize_html([["Characters", u"\u0101"]],
|
||||
{"encoding": "iso-8859-1"}))
|
||||
|
||||
def testStartTagName(self):
|
||||
self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
|
||||
|
||||
def testEmptyTagName(self):
|
||||
self.throwsWithLatin1([["EmptyTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
|
||||
|
||||
def testAttributeName(self):
|
||||
self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"span", [{"namespace": None, "name": u"\u0101", "value": u"potato"}]]])
|
||||
|
||||
def testAttributeValue(self):
|
||||
self.assertEquals("<span potato=ā>", serialize_html([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
|
||||
[{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
|
||||
{"encoding": "iso-8859-1"}))
|
||||
|
||||
def testEndTagName(self):
|
||||
self.throwsWithLatin1([["EndTag", u"http://www.w3.org/1999/xhtml", u"\u0101"]])
|
||||
|
||||
def testComment(self):
|
||||
self.throwsWithLatin1([["Comment", u"\u0101"]])
|
||||
|
||||
|
||||
if "lxml" in optionals_loaded:
|
||||
class LxmlTestCase(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.parser = etree.XMLParser(resolve_entities=False)
|
||||
self.treewalker = html5lib.getTreeWalker("lxml")
|
||||
self.serializer = serializer.HTMLSerializer()
|
||||
|
||||
def testEntityReplacement(self):
|
||||
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>β</html>"""
|
||||
tree = etree.fromstring(doc, parser = self.parser).getroottree()
|
||||
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
|
||||
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
|
||||
|
||||
def testEntityXML(self):
|
||||
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>></html>"""
|
||||
tree = etree.fromstring(doc, parser = self.parser).getroottree()
|
||||
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
|
||||
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>></html>""", result)
|
||||
|
||||
def testEntityNoResolve(self):
|
||||
doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>β</html>"""
|
||||
tree = etree.fromstring(doc, parser = self.parser).getroottree()
|
||||
result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
|
||||
resolve_entities=False)
|
||||
self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>β</html>""", result)
|
||||
|
||||
def test_serializer():
|
||||
for filename in html5lib_test_files('serializer', '*.test'):
|
||||
tests = json.load(file(filename))
|
||||
test_name = os.path.basename(filename).replace('.test','')
|
||||
for index, test in enumerate(tests['tests']):
|
||||
xhtml = test.get("xhtml", test["expected"])
|
||||
if test_name == 'optionaltags':
|
||||
xhtml = None
|
||||
yield make_test, test["input"], test["expected"], xhtml, test.get("options", {})
|
||||
97
html5lib/tests/test_stream.py
Executable file
97
html5lib/tests/test_stream.py
Executable file
@@ -0,0 +1,97 @@
|
||||
import support
|
||||
import unittest, codecs
|
||||
|
||||
from html5lib.inputstream import HTMLInputStream
|
||||
|
||||
class HTMLInputStreamShortChunk(HTMLInputStream):
|
||||
_defaultChunkSize = 2
|
||||
|
||||
class HTMLInputStreamTest(unittest.TestCase):
|
||||
|
||||
def test_char_ascii(self):
|
||||
stream = HTMLInputStream("'", encoding='ascii')
|
||||
self.assertEquals(stream.charEncoding[0], 'ascii')
|
||||
self.assertEquals(stream.char(), "'")
|
||||
|
||||
def test_char_null(self):
|
||||
stream = HTMLInputStream("\x00")
|
||||
self.assertEquals(stream.char(), u'\ufffd')
|
||||
|
||||
def test_char_utf8(self):
|
||||
stream = HTMLInputStream(u'\u2018'.encode('utf-8'), encoding='utf-8')
|
||||
self.assertEquals(stream.charEncoding[0], 'utf-8')
|
||||
self.assertEquals(stream.char(), u'\u2018')
|
||||
|
||||
def test_char_win1252(self):
|
||||
stream = HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))
|
||||
self.assertEquals(stream.charEncoding[0], 'windows-1252')
|
||||
self.assertEquals(stream.char(), u"\xa9")
|
||||
self.assertEquals(stream.char(), u"\xf1")
|
||||
self.assertEquals(stream.char(), u"\u2019")
|
||||
|
||||
def test_bom(self):
|
||||
stream = HTMLInputStream(codecs.BOM_UTF8 + "'")
|
||||
self.assertEquals(stream.charEncoding[0], 'utf-8')
|
||||
self.assertEquals(stream.char(), "'")
|
||||
|
||||
def test_utf_16(self):
|
||||
stream = HTMLInputStream((' '*1025).encode('utf-16'))
|
||||
self.assert_(stream.charEncoding[0] in ['utf-16-le', 'utf-16-be'], stream.charEncoding)
|
||||
self.assertEquals(len(stream.charsUntil(' ', True)), 1025)
|
||||
|
||||
def test_newlines(self):
|
||||
stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe")
|
||||
self.assertEquals(stream.position(), (1, 0))
|
||||
self.assertEquals(stream.charsUntil('c'), u"a\nbb\n")
|
||||
self.assertEquals(stream.position(), (3, 0))
|
||||
self.assertEquals(stream.charsUntil('x'), u"ccc\ndddd")
|
||||
self.assertEquals(stream.position(), (4, 4))
|
||||
self.assertEquals(stream.charsUntil('e'), u"x")
|
||||
self.assertEquals(stream.position(), (4, 5))
|
||||
|
||||
def test_newlines2(self):
|
||||
size = HTMLInputStream._defaultChunkSize
|
||||
stream = HTMLInputStream("\r" * size + "\n")
|
||||
self.assertEquals(stream.charsUntil('x'), "\n" * size)
|
||||
|
||||
def test_position(self):
|
||||
stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\nccc\nddde\nf\ngh")
|
||||
self.assertEquals(stream.position(), (1, 0))
|
||||
self.assertEquals(stream.charsUntil('c'), u"a\nbb\n")
|
||||
self.assertEquals(stream.position(), (3, 0))
|
||||
stream.unget(u"\n")
|
||||
self.assertEquals(stream.position(), (2, 2))
|
||||
self.assertEquals(stream.charsUntil('c'), u"\n")
|
||||
self.assertEquals(stream.position(), (3, 0))
|
||||
stream.unget(u"\n")
|
||||
self.assertEquals(stream.position(), (2, 2))
|
||||
self.assertEquals(stream.char(), u"\n")
|
||||
self.assertEquals(stream.position(), (3, 0))
|
||||
self.assertEquals(stream.charsUntil('e'), u"ccc\nddd")
|
||||
self.assertEquals(stream.position(), (4, 3))
|
||||
self.assertEquals(stream.charsUntil('h'), u"e\nf\ng")
|
||||
self.assertEquals(stream.position(), (6, 1))
|
||||
|
||||
def test_position2(self):
|
||||
stream = HTMLInputStreamShortChunk("abc\nd")
|
||||
self.assertEquals(stream.position(), (1, 0))
|
||||
self.assertEquals(stream.char(), u"a")
|
||||
self.assertEquals(stream.position(), (1, 1))
|
||||
self.assertEquals(stream.char(), u"b")
|
||||
self.assertEquals(stream.position(), (1, 2))
|
||||
self.assertEquals(stream.char(), u"c")
|
||||
self.assertEquals(stream.position(), (1, 3))
|
||||
self.assertEquals(stream.char(), u"\n")
|
||||
self.assertEquals(stream.position(), (2, 0))
|
||||
self.assertEquals(stream.char(), u"d")
|
||||
self.assertEquals(stream.position(), (2, 1))
|
||||
|
||||
def buildTestSuite():
|
||||
return unittest.defaultTestLoader.loadTestsFromName(__name__)
|
||||
|
||||
def main():
|
||||
buildTestSuite()
|
||||
unittest.main()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
193
html5lib/tests/test_tokenizer.py
Normal file
193
html5lib/tests/test_tokenizer.py
Normal file
@@ -0,0 +1,193 @@
|
||||
import sys
|
||||
import os
|
||||
import unittest
|
||||
import cStringIO
|
||||
import warnings
|
||||
import re
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
import simplejson as json
|
||||
|
||||
from support import html5lib_test_files
|
||||
from html5lib.tokenizer import HTMLTokenizer
|
||||
from html5lib import constants
|
||||
|
||||
class TokenizerTestParser(object):
|
||||
def __init__(self, initialState, lastStartTag=None):
|
||||
self.tokenizer = HTMLTokenizer
|
||||
self._state = initialState
|
||||
self._lastStartTag = lastStartTag
|
||||
|
||||
def parse(self, stream, encoding=None, innerHTML=False):
|
||||
tokenizer = self.tokenizer(stream, encoding)
|
||||
self.outputTokens = []
|
||||
|
||||
tokenizer.state = getattr(tokenizer, self._state)
|
||||
if self._lastStartTag is not None:
|
||||
tokenizer.currentToken = {"type": "startTag",
|
||||
"name":self._lastStartTag}
|
||||
|
||||
types = dict((v,k) for k,v in constants.tokenTypes.iteritems())
|
||||
for token in tokenizer:
|
||||
getattr(self, 'process%s' % types[token["type"]])(token)
|
||||
|
||||
return self.outputTokens
|
||||
|
||||
def processDoctype(self, token):
|
||||
self.outputTokens.append([u"DOCTYPE", token["name"], token["publicId"],
|
||||
token["systemId"], token["correct"]])
|
||||
|
||||
def processStartTag(self, token):
|
||||
self.outputTokens.append([u"StartTag", token["name"],
|
||||
dict(token["data"][::-1]), token["selfClosing"]])
|
||||
|
||||
def processEmptyTag(self, token):
|
||||
if token["name"] not in constants.voidElements:
|
||||
self.outputTokens.append(u"ParseError")
|
||||
self.outputTokens.append([u"StartTag", token["name"], dict(token["data"][::-1])])
|
||||
|
||||
def processEndTag(self, token):
|
||||
self.outputTokens.append([u"EndTag", token["name"],
|
||||
token["selfClosing"]])
|
||||
|
||||
def processComment(self, token):
|
||||
self.outputTokens.append([u"Comment", token["data"]])
|
||||
|
||||
def processSpaceCharacters(self, token):
|
||||
self.outputTokens.append([u"Character", token["data"]])
|
||||
self.processSpaceCharacters = self.processCharacters
|
||||
|
||||
def processCharacters(self, token):
|
||||
self.outputTokens.append([u"Character", token["data"]])
|
||||
|
||||
def processEOF(self, token):
|
||||
pass
|
||||
|
||||
def processParseError(self, token):
|
||||
self.outputTokens.append([u"ParseError", token["data"]])
|
||||
|
||||
def concatenateCharacterTokens(tokens):
|
||||
outputTokens = []
|
||||
for token in tokens:
|
||||
if not "ParseError" in token and token[0] == "Character":
|
||||
if (outputTokens and not "ParseError" in outputTokens[-1] and
|
||||
outputTokens[-1][0] == "Character"):
|
||||
outputTokens[-1][1] += token[1]
|
||||
else:
|
||||
outputTokens.append(token)
|
||||
else:
|
||||
outputTokens.append(token)
|
||||
return outputTokens
|
||||
|
||||
def normalizeTokens(tokens):
|
||||
# TODO: convert tests to reflect arrays
|
||||
for i, token in enumerate(tokens):
|
||||
if token[0] == u'ParseError':
|
||||
tokens[i] = token[0]
|
||||
return tokens
|
||||
|
||||
def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
|
||||
ignoreErrors=False):
|
||||
"""Test whether the test has passed or failed
|
||||
|
||||
If the ignoreErrorOrder flag is set to true we don't test the relative
|
||||
positions of parse errors and non parse errors
|
||||
"""
|
||||
checkSelfClosing= False
|
||||
for token in expectedTokens:
|
||||
if (token[0] == "StartTag" and len(token) == 4
|
||||
or token[0] == "EndTag" and len(token) == 3):
|
||||
checkSelfClosing = True
|
||||
break
|
||||
|
||||
if not checkSelfClosing:
|
||||
for token in receivedTokens:
|
||||
if token[0] == "StartTag" or token[0] == "EndTag":
|
||||
token.pop()
|
||||
|
||||
if not ignoreErrorOrder and not ignoreErrors:
|
||||
return expectedTokens == receivedTokens
|
||||
else:
|
||||
#Sort the tokens into two groups; non-parse errors and parse errors
|
||||
tokens = {"expected":[[],[]], "received":[[],[]]}
|
||||
for tokenType, tokenList in zip(tokens.keys(),
|
||||
(expectedTokens, receivedTokens)):
|
||||
for token in tokenList:
|
||||
if token != "ParseError":
|
||||
tokens[tokenType][0].append(token)
|
||||
else:
|
||||
if not ignoreErrors:
|
||||
tokens[tokenType][1].append(token)
|
||||
return tokens["expected"] == tokens["received"]
|
||||
|
||||
def unescape_test(test):
|
||||
def decode(inp):
|
||||
return inp.decode("unicode-escape")
|
||||
|
||||
test["input"] = decode(test["input"])
|
||||
for token in test["output"]:
|
||||
if token == "ParseError":
|
||||
continue
|
||||
else:
|
||||
token[1] = decode(token[1])
|
||||
if len(token) > 2:
|
||||
for key, value in token[2]:
|
||||
del token[2][key]
|
||||
token[2][decode(key)] = decode(value)
|
||||
return test
|
||||
|
||||
|
||||
def runTokenizerTest(test):
|
||||
#XXX - move this out into the setup function
|
||||
#concatenate all consecutive character tokens into a single token
|
||||
if 'doubleEscaped' in test:
|
||||
test = unescape_test(test)
|
||||
|
||||
expected = concatenateCharacterTokens(test['output'])
|
||||
if 'lastStartTag' not in test:
|
||||
test['lastStartTag'] = None
|
||||
outBuffer = cStringIO.StringIO()
|
||||
stdout = sys.stdout
|
||||
sys.stdout = outBuffer
|
||||
parser = TokenizerTestParser(test['initialState'],
|
||||
test['lastStartTag'])
|
||||
tokens = parser.parse(test['input'])
|
||||
tokens = concatenateCharacterTokens(tokens)
|
||||
received = normalizeTokens(tokens)
|
||||
errorMsg = u"\n".join(["\n\nInitial state:",
|
||||
test['initialState'] ,
|
||||
"\nInput:", unicode(test['input']),
|
||||
"\nExpected:", unicode(expected),
|
||||
"\nreceived:", unicode(tokens)])
|
||||
errorMsg = errorMsg.encode("utf-8")
|
||||
ignoreErrorOrder = test.get('ignoreErrorOrder', False)
|
||||
assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
|
||||
|
||||
|
||||
def _doCapitalize(match):
|
||||
return match.group(1).upper()
|
||||
|
||||
_capitalizeRe = re.compile(r"\W+(\w)").sub
|
||||
|
||||
def capitalize(s):
|
||||
s = s.lower()
|
||||
s = _capitalizeRe(_doCapitalize, s)
|
||||
return s
|
||||
|
||||
|
||||
def test_tokenizer():
|
||||
for filename in html5lib_test_files('tokenizer', '*.test'):
|
||||
tests = json.load(file(filename))
|
||||
testName = os.path.basename(filename).replace(".test","")
|
||||
if 'tests' in tests:
|
||||
for index,test in enumerate(tests['tests']):
|
||||
#Skip tests with a self closing flag
|
||||
skip = False
|
||||
if 'initialStates' not in test:
|
||||
test["initialStates"] = ["Data state"]
|
||||
for initialState in test["initialStates"]:
|
||||
test["initialState"] = capitalize(initialState)
|
||||
yield runTokenizerTest, test
|
||||
|
||||
311
html5lib/tests/test_treewalkers.py
Normal file
311
html5lib/tests/test_treewalkers.py
Normal file
@@ -0,0 +1,311 @@
|
||||
import os
|
||||
import sys
|
||||
import StringIO
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
warnings.simplefilter("error")
|
||||
|
||||
from support import html5lib_test_files, TestData, convertExpected
|
||||
|
||||
from html5lib import html5parser, treewalkers, treebuilders, constants
|
||||
from html5lib.filters.lint import Filter as LintFilter, LintError
|
||||
|
||||
def PullDOMAdapter(node):
|
||||
from xml.dom import Node
|
||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, COMMENT, CHARACTERS
|
||||
|
||||
if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
||||
for childNode in node.childNodes:
|
||||
for event in PullDOMAdapter(childNode):
|
||||
yield event
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
raise NotImplementedError("DOCTYPE nodes are not supported by PullDOM")
|
||||
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
yield COMMENT, node
|
||||
|
||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||
yield CHARACTERS, node
|
||||
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
yield START_ELEMENT, node
|
||||
for childNode in node.childNodes:
|
||||
for event in PullDOMAdapter(childNode):
|
||||
yield event
|
||||
yield END_ELEMENT, node
|
||||
|
||||
else:
|
||||
raise NotImplementedError("Node type not supported: " + str(node.nodeType))
|
||||
|
||||
treeTypes = {
|
||||
"simpletree": {"builder": treebuilders.getTreeBuilder("simpletree"),
|
||||
"walker": treewalkers.getTreeWalker("simpletree")},
|
||||
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
|
||||
"walker": treewalkers.getTreeWalker("dom")},
|
||||
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
|
||||
"adapter": PullDOMAdapter,
|
||||
"walker": treewalkers.getTreeWalker("pulldom")},
|
||||
}
|
||||
|
||||
#Try whatever etree implementations are available from a list that are
|
||||
#"supposed" to work
|
||||
try:
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
|
||||
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
|
||||
except ImportError:
|
||||
try:
|
||||
import elementtree.ElementTree as ElementTree
|
||||
treeTypes['ElementTree'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
|
||||
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import xml.etree.cElementTree as ElementTree
|
||||
treeTypes['cElementTree'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
|
||||
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
|
||||
except ImportError:
|
||||
try:
|
||||
import cElementTree as ElementTree
|
||||
treeTypes['cElementTree'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
|
||||
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import lxml.etree as ElementTree
|
||||
# treeTypes['lxml_as_etree'] = \
|
||||
# {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
|
||||
# "walker": treewalkers.getTreeWalker("etree", ElementTree)}
|
||||
treeTypes['lxml_native'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("lxml"),
|
||||
"walker": treewalkers.getTreeWalker("lxml")}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import BeautifulSoup
|
||||
treeTypes["beautifulsoup"] = \
|
||||
{"builder": treebuilders.getTreeBuilder("beautifulsoup"),
|
||||
"walker": treewalkers.getTreeWalker("beautifulsoup")}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
#Try whatever etree implementations are available from a list that are
|
||||
#"supposed" to work
|
||||
try:
|
||||
import pxdom
|
||||
treeTypes['pxdom'] = \
|
||||
{"builder": treebuilders.getTreeBuilder("dom", pxdom),
|
||||
"walker": treewalkers.getTreeWalker("dom")}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from genshi.core import QName, Attrs
|
||||
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
|
||||
|
||||
def GenshiAdapter(tree):
|
||||
text = None
|
||||
for token in treewalkers.getTreeWalker("simpletree")(tree):
|
||||
type = token["type"]
|
||||
if type in ("Characters", "SpaceCharacters"):
|
||||
if text is None:
|
||||
text = token["data"]
|
||||
else:
|
||||
text += token["data"]
|
||||
elif text is not None:
|
||||
yield TEXT, text, (None, -1, -1)
|
||||
text = None
|
||||
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
if token["namespace"]:
|
||||
name = u"{%s}%s" % (token["namespace"], token["name"])
|
||||
else:
|
||||
name = token["name"]
|
||||
yield (START,
|
||||
(QName(name),
|
||||
Attrs([(QName(attr),value) for attr,value in token["data"]])),
|
||||
(None, -1, -1))
|
||||
if type == "EmptyTag":
|
||||
type = "EndTag"
|
||||
|
||||
if type == "EndTag":
|
||||
yield END, QName(token["name"]), (None, -1, -1)
|
||||
|
||||
elif type == "Comment":
|
||||
yield COMMENT, token["data"], (None, -1, -1)
|
||||
|
||||
elif type == "Doctype":
|
||||
yield DOCTYPE, (token["name"], token["publicId"],
|
||||
token["systemId"]), (None, -1, -1)
|
||||
|
||||
else:
|
||||
pass # FIXME: What to do?
|
||||
|
||||
if text is not None:
|
||||
yield TEXT, text, (None, -1, -1)
|
||||
|
||||
#treeTypes["genshi"] = \
|
||||
# {"builder": treebuilders.getTreeBuilder("simpletree"),
|
||||
# "adapter": GenshiAdapter,
|
||||
# "walker": treewalkers.getTreeWalker("genshi")}
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
def concatenateCharacterTokens(tokens):
|
||||
charactersToken = None
|
||||
for token in tokens:
|
||||
type = token["type"]
|
||||
if type in ("Characters", "SpaceCharacters"):
|
||||
if charactersToken is None:
|
||||
charactersToken = {"type": "Characters", "data": token["data"]}
|
||||
else:
|
||||
charactersToken["data"] += token["data"]
|
||||
else:
|
||||
if charactersToken is not None:
|
||||
yield charactersToken
|
||||
charactersToken = None
|
||||
yield token
|
||||
if charactersToken is not None:
|
||||
yield charactersToken
|
||||
|
||||
def convertTokens(tokens):
|
||||
output = []
|
||||
indent = 0
|
||||
for token in concatenateCharacterTokens(tokens):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
if (token["namespace"] and
|
||||
token["namespace"] != constants.namespaces["html"]):
|
||||
if token["namespace"] in constants.prefixes:
|
||||
name = constants.prefixes[token["namespace"]]
|
||||
else:
|
||||
name = token["namespace"]
|
||||
name += u" " + token["name"]
|
||||
else:
|
||||
name = token["name"]
|
||||
output.append(u"%s<%s>" % (" "*indent, name))
|
||||
indent += 2
|
||||
attrs = token["data"]
|
||||
if attrs:
|
||||
#TODO: Remove this if statement, attrs should always exist
|
||||
for (namespace,name),value in sorted(attrs.items()):
|
||||
if namespace:
|
||||
if namespace in constants.prefixes:
|
||||
outputname = constants.prefixes[namespace]
|
||||
else:
|
||||
outputname = namespace
|
||||
outputname += u" " + name
|
||||
else:
|
||||
outputname = name
|
||||
output.append(u"%s%s=\"%s\"" % (" "*indent, outputname, value))
|
||||
if type == "EmptyTag":
|
||||
indent -= 2
|
||||
elif type == "EndTag":
|
||||
indent -= 2
|
||||
elif type == "Comment":
|
||||
output.append("%s<!-- %s -->" % (" "*indent, token["data"]))
|
||||
elif type == "Doctype":
|
||||
if token["name"]:
|
||||
if token["publicId"]:
|
||||
output.append("""%s<!DOCTYPE %s "%s" "%s">"""%
|
||||
(" "*indent, token["name"],
|
||||
token["publicId"],
|
||||
token["systemId"] and token["systemId"] or ""))
|
||||
elif token["systemId"]:
|
||||
output.append("""%s<!DOCTYPE %s "" "%s">"""%
|
||||
(" "*indent, token["name"],
|
||||
token["systemId"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE %s>"%(" "*indent,
|
||||
token["name"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE >" % (" "*indent,))
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
output.append("%s\"%s\"" % (" "*indent, token["data"]))
|
||||
else:
|
||||
pass # TODO: what to do with errors?
|
||||
return u"\n".join(output)
|
||||
|
||||
import re
|
||||
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+",re.M)
|
||||
def sortattrs(x):
|
||||
lines = x.group(0).split("\n")
|
||||
lines.sort()
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
class TokenTestCase(unittest.TestCase):
|
||||
def test_all_tokens(self):
|
||||
expected = [
|
||||
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'},
|
||||
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'},
|
||||
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'},
|
||||
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'},
|
||||
{'data': u'a', 'type': 'Characters'},
|
||||
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'},
|
||||
{'data': u'b', 'type': 'Characters'},
|
||||
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'},
|
||||
{'data': u'c', 'type': 'Characters'},
|
||||
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'},
|
||||
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'}
|
||||
]
|
||||
for treeName, treeCls in treeTypes.iteritems():
|
||||
p = html5parser.HTMLParser(tree = treeCls["builder"])
|
||||
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
|
||||
document = treeCls.get("adapter", lambda x: x)(document)
|
||||
output = treeCls["walker"](document)
|
||||
for expectedToken, outputToken in zip(expected, output):
|
||||
self.assertEquals(expectedToken, outputToken)
|
||||
|
||||
def run_test(innerHTML, input, expected, errors, treeClass):
|
||||
try:
|
||||
p = html5parser.HTMLParser(tree = treeClass["builder"])
|
||||
if innerHTML:
|
||||
document = p.parseFragment(StringIO.StringIO(input), innerHTML)
|
||||
else:
|
||||
document = p.parse(StringIO.StringIO(input))
|
||||
except constants.DataLossWarning:
|
||||
#Ignore testcases we know we don't pass
|
||||
return
|
||||
|
||||
document = treeClass.get("adapter", lambda x: x)(document)
|
||||
try:
|
||||
output = convertTokens(treeClass["walker"](document))
|
||||
output = attrlist.sub(sortattrs, output)
|
||||
expected = attrlist.sub(sortattrs, convertExpected(expected))
|
||||
assert expected == output, "\n".join([
|
||||
"", "Input:", input,
|
||||
"", "Expected:", expected,
|
||||
"", "Received:", output
|
||||
])
|
||||
except NotImplementedError:
|
||||
pass # Amnesty for those that confess...
|
||||
|
||||
def test_treewalker():
|
||||
sys.stdout.write('Testing tree walkers '+ " ".join(treeTypes.keys()) + "\n")
|
||||
|
||||
for treeName, treeCls in treeTypes.iteritems():
|
||||
files = html5lib_test_files('tree-construction')
|
||||
for filename in files:
|
||||
testName = os.path.basename(filename).replace(".dat","")
|
||||
|
||||
tests = TestData(filename, "data")
|
||||
|
||||
for index, test in enumerate(tests):
|
||||
(input, errors,
|
||||
innerHTML, expected) = [test[key] for key in ("data", "errors",
|
||||
"document-fragment",
|
||||
"document")]
|
||||
errors = errors.split("\n")
|
||||
yield run_test, innerHTML, input, expected, errors, treeCls
|
||||
|
||||
|
||||
123
html5lib/tests/test_whitespace_filter.py
Normal file
123
html5lib/tests/test_whitespace_filter.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import unittest
|
||||
|
||||
from html5lib.filters.whitespace import Filter
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class TestCase(unittest.TestCase):
|
||||
def runTest(self, input, expected):
|
||||
output = list(Filter(input))
|
||||
errorMsg = "\n".join(["\n\nInput:", str(input),
|
||||
"\nExpected:", str(expected),
|
||||
"\nReceived:", str(output)])
|
||||
self.assertEquals(output, expected, errorMsg)
|
||||
|
||||
def runTestUnmodifiedOutput(self, input):
|
||||
self.runTest(input, input)
|
||||
|
||||
def testPhrasingElements(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"Characters", "data": u"This is a " },
|
||||
{"type": u"StartTag", "name": u"span", "data": [] },
|
||||
{"type": u"Characters", "data": u"phrase" },
|
||||
{"type": u"EndTag", "name": u"span", "data": []},
|
||||
{"type": u"SpaceCharacters", "data": u" " },
|
||||
{"type": u"Characters", "data": u"with" },
|
||||
{"type": u"SpaceCharacters", "data": u" " },
|
||||
{"type": u"StartTag", "name": u"em", "data": [] },
|
||||
{"type": u"Characters", "data": u"emphasised text" },
|
||||
{"type": u"EndTag", "name": u"em", "data": []},
|
||||
{"type": u"Characters", "data": u" and an " },
|
||||
{"type": u"StartTag", "name": u"img", "data": [[u"alt", u"image"]] },
|
||||
{"type": u"Characters", "data": u"." }])
|
||||
|
||||
def testLeadingWhitespace(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"SpaceCharacters", "data": spaceCharacters},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}],
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"SpaceCharacters", "data": u" "},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}])
|
||||
|
||||
def testLeadingWhitespaceAsCharacters(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": spaceCharacters + u"foo"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}],
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u" foo"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}])
|
||||
|
||||
def testTrailingWhitespace(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"SpaceCharacters", "data": spaceCharacters},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}],
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"SpaceCharacters", "data": u" "},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}])
|
||||
|
||||
def testTrailingWhitespaceAsCharacters(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo" + spaceCharacters},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}],
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo "},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}])
|
||||
|
||||
def testWhitespace(self):
|
||||
self.runTest(
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo" + spaceCharacters + "bar"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}],
|
||||
[{"type": u"StartTag", "name": u"p", "data": []},
|
||||
{"type": u"Characters", "data": u"foo bar"},
|
||||
{"type": u"EndTag", "name": u"p", "data": []}])
|
||||
|
||||
def testLeadingWhitespaceInPre(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"StartTag", "name": u"pre", "data": []},
|
||||
{"type": u"SpaceCharacters", "data": spaceCharacters},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"EndTag", "name": u"pre", "data": []}])
|
||||
|
||||
def testLeadingWhitespaceAsCharactersInPre(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"StartTag", "name": u"pre", "data": []},
|
||||
{"type": u"Characters", "data": spaceCharacters + u"foo"},
|
||||
{"type": u"EndTag", "name": u"pre", "data": []}])
|
||||
|
||||
def testTrailingWhitespaceInPre(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"StartTag", "name": u"pre", "data": []},
|
||||
{"type": u"Characters", "data": u"foo"},
|
||||
{"type": u"SpaceCharacters", "data": spaceCharacters},
|
||||
{"type": u"EndTag", "name": u"pre", "data": []}])
|
||||
|
||||
def testTrailingWhitespaceAsCharactersInPre(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"StartTag", "name": u"pre", "data": []},
|
||||
{"type": u"Characters", "data": u"foo" + spaceCharacters},
|
||||
{"type": u"EndTag", "name": u"pre", "data": []}])
|
||||
|
||||
def testWhitespaceInPre(self):
|
||||
self.runTestUnmodifiedOutput(
|
||||
[{"type": u"StartTag", "name": u"pre", "data": []},
|
||||
{"type": u"Characters", "data": u"foo" + spaceCharacters + "bar"},
|
||||
{"type": u"EndTag", "name": u"pre", "data": []}])
|
||||
|
||||
def buildTestSuite():
|
||||
return unittest.defaultTestLoader.loadTestsFromName(__name__)
|
||||
|
||||
def main():
|
||||
buildTestSuite()
|
||||
unittest.main()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
10
html5lib/tests/testdata/encoding/test-yahoo-jp.dat
vendored
Normal file
10
html5lib/tests/testdata/encoding/test-yahoo-jp.dat
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
#data
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
|
||||
<!--京-->
|
||||
<title>Yahoo! JAPAN</title>
|
||||
<meta name="description" content="日本最大級のポータルサイト。検索、オークション、ニュース、メール、コミュニティ、ショッピング、など80以上のサービスを展開。あなたの生活をより豊かにする「ライフ・エンジン」を目指していきます。">
|
||||
<style type="text/css" media="all">
|
||||
#encoding
|
||||
euc_jp
|
||||
394
html5lib/tests/testdata/encoding/tests1.dat
vendored
Normal file
394
html5lib/tests/testdata/encoding/tests1.dat
vendored
Normal file
File diff suppressed because one or more lines are too long
115
html5lib/tests/testdata/encoding/tests2.dat
vendored
Normal file
115
html5lib/tests/testdata/encoding/tests2.dat
vendored
Normal file
@@ -0,0 +1,115 @@
|
||||
#data
|
||||
<meta
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<!
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta charset = "
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta charset=euc_jp
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta <meta charset='euc_jp'>
|
||||
#encoding
|
||||
euc_jp
|
||||
|
||||
#data
|
||||
<meta charset = 'euc_jp'>
|
||||
#encoding
|
||||
euc_jp
|
||||
|
||||
#data
|
||||
<!-- -->
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<!-- -->
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta http-equiv="Content-Type<meta charset="utf-8">
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<meta http-equiv="Content-Type" content="text/html; charset='utf-8">
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta charset =
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta charset= utf-8
|
||||
>
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<meta content = "text/html;
|
||||
#encoding
|
||||
windows-1252
|
||||
|
||||
#data
|
||||
<meta charset="UTF-16">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<meta charset="UTF-16LE">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<meta charset="UTF-16BE">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<html a=ñ>
|
||||
<meta charset="utf-8">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<html ñ>
|
||||
<meta charset="utf-8">
|
||||
#encoding
|
||||
utf-8
|
||||
|
||||
#data
|
||||
<html>ñ
|
||||
<meta charset="utf-8">
|
||||
#encoding
|
||||
utf-8
|
||||
501
html5lib/tests/testdata/sanitizer/tests1.dat
vendored
Normal file
501
html5lib/tests/testdata/sanitizer/tests1.dat
vendored
Normal file
@@ -0,0 +1,501 @@
|
||||
[
|
||||
{
|
||||
"name": "IE_Comments",
|
||||
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
|
||||
"output": ""
|
||||
},
|
||||
|
||||
{
|
||||
"name": "IE_Comments_2",
|
||||
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
|
||||
"output": "<script>alert('XSS');</script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "allow_colons_in_path_component",
|
||||
"input": "<a href=\"./this:that\">foo</a>",
|
||||
"output": "<a href='./this:that'>foo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "background_attribute",
|
||||
"input": "<div background=\"javascript:alert('XSS')\"></div>",
|
||||
"output": "<div/>",
|
||||
"xhtml": "<div></div>",
|
||||
"rexml": "<div></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "bgsound",
|
||||
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<bgsound src=\"javascript:alert('XSS');\"/>",
|
||||
"rexml": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "div_background_image_unicode_encoded",
|
||||
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
|
||||
"output": "<div style=''>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "div_expression",
|
||||
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
|
||||
"output": "<div style=''>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "double_open_angle_brackets",
|
||||
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
|
||||
"output": "<img src='http://ha.ckers.org/scriptlet.html'>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "double_open_angle_brackets_2",
|
||||
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
|
||||
"output": "<script src=\"http://ha.ckers.org/scriptlet.html\" <=\"\">",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "grave_accents",
|
||||
"input": "<img src=`javascript:alert('XSS')` />",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "img_dynsrc_lowsrc",
|
||||
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "img_vbscript",
|
||||
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "input_image",
|
||||
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<input type='image'/>",
|
||||
"rexml": "<input type='image' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "link_stylesheets",
|
||||
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
|
||||
"output": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"/>",
|
||||
"rexml": "<link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "link_stylesheets_2",
|
||||
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
|
||||
"output": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"/>",
|
||||
"rexml": "<link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "list_style_image",
|
||||
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
|
||||
"output": "<li style=''>foo</li>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "no_closing_script_tags",
|
||||
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
|
||||
"output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\">",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit",
|
||||
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"output": "<script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_2",
|
||||
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_3",
|
||||
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
|
||||
"output": "<img src='http://ha.ckers.org/xss.js'/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_II",
|
||||
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "non_alpha_non_digit_III",
|
||||
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
|
||||
"output": "<a>foo</a>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "platypus",
|
||||
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
|
||||
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "protocol_resolution_in_script_tag",
|
||||
"input": "<script src=//ha.ckers.org/.j></script>",
|
||||
"output": "<script src=\"//ha.ckers.org/.j\"></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_anchors",
|
||||
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
|
||||
"output": "<a href='foo'><script>baz</script></a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_alt_attribute",
|
||||
"input": "<img alt='foo' onclick='bar' />",
|
||||
"output": "<img alt='foo'/>",
|
||||
"rexml": "<img alt='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_height_attribute",
|
||||
"input": "<img height='foo' onclick='bar' />",
|
||||
"output": "<img height='foo'/>",
|
||||
"rexml": "<img height='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_src_attribute",
|
||||
"input": "<img src='foo' onclick='bar' />",
|
||||
"output": "<img src='foo'/>",
|
||||
"rexml": "<img src='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_allow_image_width_attribute",
|
||||
"input": "<img width='foo' onclick='bar' />",
|
||||
"output": "<img width='foo'/>",
|
||||
"rexml": "<img width='foo' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_blank_text",
|
||||
"input": "",
|
||||
"output": ""
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_malformed_image_tags",
|
||||
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
|
||||
"output": "<img/><script>alert(\"XSS\")</script>\">",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_handle_non_html",
|
||||
"input": "abc",
|
||||
"output": "abc"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_ridiculous_hack",
|
||||
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_0",
|
||||
"input": "<img src=\"javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_1",
|
||||
"input": "<img src=javascript:alert('XSS') />",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_10",
|
||||
"input": "<img src=\"jav
ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_11",
|
||||
"input": "<img src=\"jav
ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_12",
|
||||
"input": "<img src=\"  javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_13",
|
||||
"input": "<img src=\" javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_14",
|
||||
"input": "<img src=\" javascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_2",
|
||||
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_3",
|
||||
"input": "<img src='javascript:alert("XSS")' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_4",
|
||||
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_5",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_6",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_7",
|
||||
"input": "<img src='javascript:alert('XSS')' />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_8",
|
||||
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_not_fall_for_xss_image_hack_9",
|
||||
"input": "<img src=\"jav	ascript:alert('XSS');\" />",
|
||||
"output": "<img/>",
|
||||
"rexml": "<img />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_half_open_scripts",
|
||||
"input": "<img src=\"javascript:alert('XSS')\"",
|
||||
"output": "<img/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_invalid_script_tag",
|
||||
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"output": "<script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"></script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
|
||||
"input": "<<script>alert(\"XSS\");//<</script>",
|
||||
"output": "<<script>alert(\"XSS\");//<</script>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
|
||||
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
|
||||
"output": "<iframe src=\"http://ha.ckers.org/scriptlet.html\" <=\"\">",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_tag_broken_up_by_null",
|
||||
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
|
||||
"output": "<scr\ufffdipt>alert(\"XSS\")</scr\ufffdipt>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_sanitize_unclosed_script",
|
||||
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
|
||||
"output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\">",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
|
||||
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
|
||||
"output": "<a title='1'>boo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
|
||||
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
|
||||
"output": "<a title='1'>boo</a>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
|
||||
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
|
||||
"output": "<img title='1'/>boo",
|
||||
"rexml": "<img title='1' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
|
||||
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
|
||||
"output": "<img title='1'/>boo",
|
||||
"rexml": "<img title='1' />"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "xml_base",
|
||||
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
|
||||
"output": "<div>foo</div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "xul",
|
||||
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
|
||||
"output": "<p style=''>fubar</p>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "quotes_in_attributes",
|
||||
"input": "<img src='foo' title='\"foo\" bar' />",
|
||||
"rexml": "<img src='foo' title='\"foo\" bar' />",
|
||||
"output": "<img title='"foo" bar' src='foo'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "uri_refs_in_svg_attributes",
|
||||
"input": "<rect fill='url(#foo)' />",
|
||||
"rexml": "<rect fill='url(#foo)'></rect>",
|
||||
"xhtml": "<rect fill='url(#foo)'></rect>",
|
||||
"output": "<rect fill='url(#foo)'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "absolute_uri_refs_in_svg_attributes",
|
||||
"input": "<rect fill='url(http://bad.com/) #fff' />",
|
||||
"rexml": "<rect fill=' #fff'></rect>",
|
||||
"xhtml": "<rect fill=' #fff'></rect>",
|
||||
"output": "<rect fill=' #fff'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "uri_ref_with_space_in svg_attribute",
|
||||
"input": "<rect fill='url(\n#foo)' />",
|
||||
"rexml": "<rect fill='url(\n#foo)'></rect>",
|
||||
"xhtml": "<rect fill='url(\n#foo)'></rect>",
|
||||
"output": "<rect fill='url(\n#foo)'/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "absolute_uri_ref_with_space_in svg_attribute",
|
||||
"input": "<rect fill=\"url(\nhttp://bad.com/)\" />",
|
||||
"rexml": "<rect fill=' '></rect>",
|
||||
"xhtml": "<rect fill=' '></rect>",
|
||||
"output": "<rect fill=' '/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "allow_html5_image_tag",
|
||||
"input": "<image src='foo' />",
|
||||
"rexml": "<image src=\"foo\"></image>",
|
||||
"output": "<image src=\"foo\"/>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "style_attr_end_with_nothing",
|
||||
"input": "<div style=\"color: blue\" />",
|
||||
"output": "<div style='color: blue;'/>",
|
||||
"xhtml": "<div style='color: blue;'></div>",
|
||||
"rexml": "<div style='color: blue;'></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "style_attr_end_with_space",
|
||||
"input": "<div style=\"color: blue \" />",
|
||||
"output": "<div style='color: blue ;'/>",
|
||||
"xhtml": "<div style='color: blue ;'></div>",
|
||||
"rexml": "<div style='color: blue ;'></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "style_attr_end_with_semicolon",
|
||||
"input": "<div style=\"color: blue;\" />",
|
||||
"output": "<div style='color: blue;'/>",
|
||||
"xhtml": "<div style='color: blue;'></div>",
|
||||
"rexml": "<div style='color: blue;'></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "style_attr_end_with_semicolon_space",
|
||||
"input": "<div style=\"color: blue; \" />",
|
||||
"output": "<div style='color: blue;'/>",
|
||||
"xhtml": "<div style='color: blue;'></div>",
|
||||
"rexml": "<div style='color: blue;'></div>"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "attributes_with_embedded_quotes",
|
||||
"input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
|
||||
"output": "<img src='doesntexist.jpg"'onerror="alert(1)'/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
},
|
||||
|
||||
{
|
||||
"name": "attributes_with_embedded_quotes_II",
|
||||
"input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
|
||||
"output": "<img src='notthere.jpg""onerror="alert(2)'/>",
|
||||
"rexml": "Ill-formed XHTML!"
|
||||
}
|
||||
]
|
||||
125
html5lib/tests/testdata/serializer/core.test
vendored
Normal file
125
html5lib/tests/testdata/serializer/core.test
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
{"tests": [
|
||||
|
||||
{"description": "proper attribute value escaping",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test \"with\" ""}]]],
|
||||
"expected": ["<span title='test \"with\" &quot;'>"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value non-quoting",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo"}]]],
|
||||
"expected": ["<span title=foo>"],
|
||||
"xhtml": ["<span title=\"foo\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value non-quoting (with <)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo<bar"}]]],
|
||||
"expected": ["<span title=foo<bar>"],
|
||||
"xhtml": ["<span title=\"foo<bar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with =)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo=bar"}]]],
|
||||
"expected": ["<span title=\"foo=bar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with >)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo>bar"}]]],
|
||||
"expected": ["<span title=\"foo>bar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with \")",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\"bar"}]]],
|
||||
"expected": ["<span title='foo\"bar'>"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with ')",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar"}]]],
|
||||
"expected": ["<span title=\"foo'bar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with both \" and ')",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar\"baz"}]]],
|
||||
"expected": ["<span title=\"foo'bar"baz\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with space)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo bar"}]]],
|
||||
"expected": ["<span title=\"foo bar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with tab)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\tbar"}]]],
|
||||
"expected": ["<span title=\"foo\tbar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with LF)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\nbar"}]]],
|
||||
"expected": ["<span title=\"foo\nbar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with CR)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\rbar"}]]],
|
||||
"expected": ["<span title=\"foo\rbar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value non-quoting (with linetab)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Bbar"}]]],
|
||||
"expected": ["<span title=foo\u000Bbar>"],
|
||||
"xhtml": ["<span title=\"foo\u000Bbar\">"]
|
||||
},
|
||||
|
||||
{"description": "proper attribute value quoting (with form feed)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Cbar"}]]],
|
||||
"expected": ["<span title=\"foo\u000Cbar\">"]
|
||||
},
|
||||
|
||||
{"description": "void element (as EmptyTag token)",
|
||||
"input": [["EmptyTag", "img", {}]],
|
||||
"expected": ["<img>"],
|
||||
"xhtml": ["<img />"]
|
||||
},
|
||||
|
||||
{"description": "void element (as StartTag token)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "img", {}]],
|
||||
"expected": ["<img>"],
|
||||
"xhtml": ["<img />"]
|
||||
},
|
||||
|
||||
{"description": "doctype in error",
|
||||
"input": [["Doctype", "foo"]],
|
||||
"expected": ["<!DOCTYPE foo>"]
|
||||
},
|
||||
|
||||
{"description": "character data",
|
||||
"options": {"encoding":"utf-8"},
|
||||
"input": [["Characters", "a<b>c&d"]],
|
||||
"expected": ["a<b>c&d"]
|
||||
},
|
||||
|
||||
{"description": "rcdata",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
|
||||
"expected": ["<script>a<b>c&d"],
|
||||
"xhtml": ["<script>a<b>c&d"]
|
||||
},
|
||||
|
||||
{"description": "doctype",
|
||||
"input": [["Doctype", "HTML"]],
|
||||
"expected": ["<!DOCTYPE HTML>"]
|
||||
},
|
||||
|
||||
{"description": "HTML 4.01 DOCTYPE",
|
||||
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"]],
|
||||
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"]
|
||||
},
|
||||
|
||||
{"description": "HTML 4.01 DOCTYPE without system identifer",
|
||||
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN"]],
|
||||
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"]
|
||||
},
|
||||
|
||||
{"description": "IBM DOCTYPE without public identifer",
|
||||
"input": [["Doctype", "html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]],
|
||||
"expected": ["<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"]
|
||||
}
|
||||
|
||||
]}
|
||||
66
html5lib/tests/testdata/serializer/injectmeta.test
vendored
Normal file
66
html5lib/tests/testdata/serializer/injectmeta.test
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
{"tests": [
|
||||
|
||||
{"description": "no encoding",
|
||||
"options": {"inject_meta_charset": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": [""],
|
||||
"xhtml": ["<head></head>"]
|
||||
},
|
||||
|
||||
{"description": "empytag head",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta charset=utf-8>"],
|
||||
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/title",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml","title",{}], ["Characters", "foo"],["EndTag", "http://www.w3.org/1999/xhtml", "title"], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta charset=utf-8><title>foo</title>"],
|
||||
"xhtml": ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/meta-charset",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta charset=utf-8>"],
|
||||
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/ two meta-charset",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
|
||||
"xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/robots",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta charset=utf-8><meta content=noindex name=robots>"],
|
||||
"xhtml": ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/robots & charset",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta content=noindex name=robots><meta charset=utf-8>"],
|
||||
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/ charset in http-equiv content-type",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
|
||||
"xhtml": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
|
||||
},
|
||||
|
||||
{"description": "head w/robots & charset in http-equiv content-type",
|
||||
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": ["<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
|
||||
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
|
||||
}
|
||||
|
||||
]}
|
||||
965
html5lib/tests/testdata/serializer/optionaltags.test
vendored
Normal file
965
html5lib/tests/testdata/serializer/optionaltags.test
vendored
Normal file
@@ -0,0 +1,965 @@
|
||||
{"tests": [
|
||||
|
||||
{"description": "html start-tag followed by text, with attributes",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", [{"namespace": null, "name": "lang", "value": "en"}]], ["Characters", "foo"]],
|
||||
"expected": ["<html lang=en>foo"]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "html start-tag followed by comment",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Comment", "foo"]],
|
||||
"expected": ["<html><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "html start-tag followed by space character",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", " foo"]],
|
||||
"expected": ["<html> foo"]
|
||||
},
|
||||
|
||||
{"description": "html start-tag followed by text",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "html start-tag followed by start-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "html start-tag followed by end-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "html start-tag at EOF (shouldn't ever happen?!)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "html end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Comment", "foo"]],
|
||||
"expected": ["</html><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "html end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", " foo"]],
|
||||
"expected": ["</html> foo"]
|
||||
},
|
||||
|
||||
{"description": "html end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "html end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "html end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "html end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "head start-tag followed by comment",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Comment", "foo"]],
|
||||
"expected": ["<head><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "head start-tag followed by space character",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", " foo"]],
|
||||
"expected": ["<head> foo"]
|
||||
},
|
||||
|
||||
{"description": "head start-tag followed by text",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", "foo"]],
|
||||
"expected": ["<head>foo"]
|
||||
},
|
||||
|
||||
{"description": "head start-tag followed by start-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "head start-tag followed by end-tag (shouldn't ever happen?!)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["<head></foo>", "</foo>"]
|
||||
},
|
||||
|
||||
{"description": "empty head element",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
{"description": "head start-tag followed by empty-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "head start-tag at EOF (shouldn't ever happen?!)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}]],
|
||||
"expected": ["<head>", ""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "head end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Comment", "foo"]],
|
||||
"expected": ["</head><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "head end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", " foo"]],
|
||||
"expected": ["</head> foo"]
|
||||
},
|
||||
|
||||
{"description": "head end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "head end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "head end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "head end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "body start-tag followed by comment",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Comment", "foo"]],
|
||||
"expected": ["<body><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "body start-tag followed by space character",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", " foo"]],
|
||||
"expected": ["<body> foo"]
|
||||
},
|
||||
|
||||
{"description": "body start-tag followed by text",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "body start-tag followed by start-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "body start-tag followed by end-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "body start-tag at EOF (shouldn't ever happen?!)",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "body end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Comment", "foo"]],
|
||||
"expected": ["</body><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "body end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", " foo"]],
|
||||
"expected": ["</body> foo"]
|
||||
},
|
||||
|
||||
{"description": "body end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "body end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "body end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "body end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "li end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Comment", "foo"]],
|
||||
"expected": ["</li><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", " foo"]],
|
||||
"expected": ["</li> foo"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", "foo"]],
|
||||
"expected": ["</li>foo"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</li><foo>"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag followed by li start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "li", {}]],
|
||||
"expected": ["<li>"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "li end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "dt end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Comment", "foo"]],
|
||||
"expected": ["</dt><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", " foo"]],
|
||||
"expected": ["</dt> foo"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", "foo"]],
|
||||
"expected": ["</dt>foo"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</dt><foo>"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by dt start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
|
||||
"expected": ["<dt>"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by dd start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
|
||||
"expected": ["<dd>"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</dt></foo>"]
|
||||
},
|
||||
|
||||
{"description": "dt end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"]],
|
||||
"expected": ["</dt>"]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "dd end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Comment", "foo"]],
|
||||
"expected": ["</dd><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", " foo"]],
|
||||
"expected": ["</dd> foo"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", "foo"]],
|
||||
"expected": ["</dd>foo"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</dd><foo>"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by dd start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
|
||||
"expected": ["<dd>"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by dt start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
|
||||
"expected": ["<dt>"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "dd end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "p end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Comment", "foo"]],
|
||||
"expected": ["</p><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", " foo"]],
|
||||
"expected": ["</p> foo"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", "foo"]],
|
||||
"expected": ["</p>foo"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</p><foo>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by address start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "address", {}]],
|
||||
"expected": ["<address>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by article start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "article", {}]],
|
||||
"expected": ["<article>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by aside start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "aside", {}]],
|
||||
"expected": ["<aside>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by blockquote start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "blockquote", {}]],
|
||||
"expected": ["<blockquote>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by datagrid start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "datagrid", {}]],
|
||||
"expected": ["<datagrid>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by dialog start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dialog", {}]],
|
||||
"expected": ["<dialog>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by dir start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dir", {}]],
|
||||
"expected": ["<dir>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by div start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
|
||||
"expected": ["<div>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by dl start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dl", {}]],
|
||||
"expected": ["<dl>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by fieldset start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "fieldset", {}]],
|
||||
"expected": ["<fieldset>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by footer start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "footer", {}]],
|
||||
"expected": ["<footer>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by form start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "form", {}]],
|
||||
"expected": ["<form>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h1 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h1", {}]],
|
||||
"expected": ["<h1>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h2 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h2", {}]],
|
||||
"expected": ["<h2>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h3 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h3", {}]],
|
||||
"expected": ["<h3>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h4 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h4", {}]],
|
||||
"expected": ["<h4>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h5 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h5", {}]],
|
||||
"expected": ["<h5>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by h6 start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h6", {}]],
|
||||
"expected": ["<h6>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by header start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "header", {}]],
|
||||
"expected": ["<header>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by hr empty-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EmptyTag", "hr", {}]],
|
||||
"expected": ["<hr>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by menu start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "menu", {}]],
|
||||
"expected": ["<menu>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by nav start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "nav", {}]],
|
||||
"expected": ["<nav>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by ol start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ol", {}]],
|
||||
"expected": ["<ol>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by p start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "p", {}]],
|
||||
"expected": ["<p>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by pre start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}]],
|
||||
"expected": ["<pre>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by section start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "section", {}]],
|
||||
"expected": ["<section>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by table start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "table", {}]],
|
||||
"expected": ["<table>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by ul start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ul", {}]],
|
||||
"expected": ["<ul>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "p end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "optgroup end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Comment", "foo"]],
|
||||
"expected": ["</optgroup><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", " foo"]],
|
||||
"expected": ["</optgroup> foo"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", "foo"]],
|
||||
"expected": ["</optgroup>foo"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</optgroup><foo>"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag followed by optgroup start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
|
||||
"expected": ["<optgroup>"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "optgroup end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "option end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Comment", "foo"]],
|
||||
"expected": ["</option><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", " foo"]],
|
||||
"expected": ["</option> foo"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", "foo"]],
|
||||
"expected": ["</option>foo"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by optgroup start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
|
||||
"expected": ["<optgroup>"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</option><foo>"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by option start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "option", {}]],
|
||||
"expected": ["<option>"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "option end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "colgroup start-tag followed by comment",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Comment", "foo"]],
|
||||
"expected": ["<colgroup><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "colgroup start-tag followed by space character",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", " foo"]],
|
||||
"expected": ["<colgroup> foo"]
|
||||
},
|
||||
|
||||
{"description": "colgroup start-tag followed by text",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", "foo"]],
|
||||
"expected": ["<colgroup>foo"]
|
||||
},
|
||||
|
||||
{"description": "colgroup start-tag followed by start-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<colgroup><foo>"]
|
||||
},
|
||||
|
||||
{"description": "first colgroup in a table with a col child",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EmptyTag", "col", {}]],
|
||||
"expected": ["<table><col>"]
|
||||
},
|
||||
|
||||
{"description": "colgroup with a col child, following another colgroup",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "col", {}]],
|
||||
"expected": ["</colgroup><col>", "<colgroup><col>"]
|
||||
},
|
||||
|
||||
{"description": "colgroup start-tag followed by end-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["<colgroup></foo>"]
|
||||
},
|
||||
|
||||
{"description": "colgroup start-tag at EOF",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}]],
|
||||
"expected": ["<colgroup>"]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "colgroup end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Comment", "foo"]],
|
||||
"expected": ["</colgroup><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "colgroup end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", " foo"]],
|
||||
"expected": ["</colgroup> foo"]
|
||||
},
|
||||
|
||||
{"description": "colgroup end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", "foo"]],
|
||||
"expected": ["foo"]
|
||||
},
|
||||
|
||||
{"description": "colgroup end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<foo>"]
|
||||
},
|
||||
|
||||
{"description": "colgroup end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "colgroup end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "thead end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Comment", "foo"]],
|
||||
"expected": ["</thead><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", " foo"]],
|
||||
"expected": ["</thead> foo"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", "foo"]],
|
||||
"expected": ["</thead>foo"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</thead><foo>"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by tbody start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
|
||||
"expected": ["<tbody>"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by tfoot start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
|
||||
"expected": ["<tfoot>"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</thead></foo>"]
|
||||
},
|
||||
|
||||
{"description": "thead end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"]],
|
||||
"expected": ["</thead>"]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "tbody start-tag followed by comment",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Comment", "foo"]],
|
||||
"expected": ["<tbody><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "tbody start-tag followed by space character",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", " foo"]],
|
||||
"expected": ["<tbody> foo"]
|
||||
},
|
||||
|
||||
{"description": "tbody start-tag followed by text",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", "foo"]],
|
||||
"expected": ["<tbody>foo"]
|
||||
},
|
||||
|
||||
{"description": "tbody start-tag followed by start-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["<tbody><foo>"]
|
||||
},
|
||||
|
||||
{"description": "first tbody in a table with a tr child",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
|
||||
"expected": ["<table><tr>"]
|
||||
},
|
||||
|
||||
{"description": "tbody with a tr child, following another tbody",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
|
||||
"expected": ["<tbody><tr>", "</tbody><tr>"]
|
||||
},
|
||||
|
||||
{"description": "tbody with a tr child, following a thead",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
|
||||
"expected": ["<tbody><tr>", "</thead><tr>"]
|
||||
},
|
||||
|
||||
{"description": "tbody with a tr child, following a tfoot",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
|
||||
"expected": ["<tbody><tr>", "</tfoot><tr>"]
|
||||
},
|
||||
|
||||
{"description": "tbody start-tag followed by end-tag",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["<tbody></foo>"]
|
||||
},
|
||||
|
||||
{"description": "tbody start-tag at EOF",
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
|
||||
"expected": ["<tbody>"]
|
||||
},
|
||||
|
||||
|
||||
|
||||
{"description": "tbody end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Comment", "foo"]],
|
||||
"expected": ["</tbody><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", " foo"]],
|
||||
"expected": ["</tbody> foo"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", "foo"]],
|
||||
"expected": ["</tbody>foo"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</tbody><foo>"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by tbody start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
|
||||
"expected": ["<tbody>", "</tbody>"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by tfoot start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
|
||||
"expected": ["<tfoot>"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "tbody end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "tfoot end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Comment", "foo"]],
|
||||
"expected": ["</tfoot><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", " foo"]],
|
||||
"expected": ["</tfoot> foo"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", "foo"]],
|
||||
"expected": ["</tfoot>foo"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</tfoot><foo>"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag followed by tbody start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
|
||||
"expected": ["<tbody>", "</tfoot>"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "tfoot end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "tr end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Comment", "foo"]],
|
||||
"expected": ["</tr><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", " foo"]],
|
||||
"expected": ["</tr> foo"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", "foo"]],
|
||||
"expected": ["</tr>foo"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</tr><foo>"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag followed by tr start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
|
||||
"expected": ["<tr>", "</tr>"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "tr end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "td end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Comment", "foo"]],
|
||||
"expected": ["</td><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", " foo"]],
|
||||
"expected": ["</td> foo"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", "foo"]],
|
||||
"expected": ["</td>foo"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</td><foo>"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by td start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
|
||||
"expected": ["<td>", "</td>"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by th start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
|
||||
"expected": ["<th>", "</td>"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "td end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"]],
|
||||
"expected": [""]
|
||||
},
|
||||
|
||||
|
||||
|
||||
|
||||
{"description": "th end-tag followed by comment",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Comment", "foo"]],
|
||||
"expected": ["</th><!--foo-->"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by space character",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", " foo"]],
|
||||
"expected": ["</th> foo"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by text",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", "foo"]],
|
||||
"expected": ["</th>foo"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
|
||||
"expected": ["</th><foo>"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by th start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
|
||||
"expected": ["<th>", "</th>"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by td start-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
|
||||
"expected": ["<td>", "</th>"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag followed by end-tag",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
|
||||
"expected": ["</foo>"]
|
||||
},
|
||||
|
||||
{"description": "th end-tag at EOF",
|
||||
"input": [["EndTag", "http://www.w3.org/1999/xhtml" , "th"]],
|
||||
"expected": [""]
|
||||
}
|
||||
|
||||
]}
|
||||
60
html5lib/tests/testdata/serializer/options.test
vendored
Normal file
60
html5lib/tests/testdata/serializer/options.test
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
{"tests":[
|
||||
|
||||
{"description": "quote_char=\"'\"",
|
||||
"options": {"quote_char": "'"},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test 'with' quote_char"}]]],
|
||||
"expected": ["<span title='test 'with' quote_char'>"]
|
||||
},
|
||||
|
||||
{"description": "quote_attr_values=true",
|
||||
"options": {"quote_attr_values": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "button", [{"namespace": null, "name": "disabled", "value" :"disabled"}]]],
|
||||
"expected": ["<button disabled>"],
|
||||
"xhtml": ["<button disabled=\"disabled\">"]
|
||||
},
|
||||
|
||||
{"description": "quote_attr_values=true with irrelevant",
|
||||
"options": {"quote_attr_values": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
|
||||
"expected": ["<div irrelevant>"],
|
||||
"xhtml": ["<div irrelevant=\"irrelevant\">"]
|
||||
},
|
||||
|
||||
{"description": "use_trailing_solidus=true with void element",
|
||||
"options": {"use_trailing_solidus": true},
|
||||
"input": [["EmptyTag", "img", {}]],
|
||||
"expected": ["<img />"]
|
||||
},
|
||||
|
||||
{"description": "use_trailing_solidus=true with non-void element",
|
||||
"options": {"use_trailing_solidus": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
|
||||
"expected": ["<div>"]
|
||||
},
|
||||
|
||||
{"description": "minimize_boolean_attributes=false",
|
||||
"options": {"minimize_boolean_attributes": false},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
|
||||
"expected": ["<div irrelevant=irrelevant>"],
|
||||
"xhtml": ["<div irrelevant=\"irrelevant\">"]
|
||||
},
|
||||
|
||||
{"description": "minimize_boolean_attributes=false with empty value",
|
||||
"options": {"minimize_boolean_attributes": false},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :""}]]],
|
||||
"expected": ["<div irrelevant=\"\">"]
|
||||
},
|
||||
|
||||
{"description": "escape less than signs in attribute values",
|
||||
"options": {"escape_lt_in_attrs": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "a", [{"namespace": null, "name": "title", "value": "a<b>c&d"}]]],
|
||||
"expected": ["<a title=\"a<b>c&d\">"]
|
||||
},
|
||||
|
||||
{"description": "rcdata",
|
||||
"options": {"escape_rcdata": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
|
||||
"expected": ["<script>a<b>c&d"]
|
||||
}
|
||||
|
||||
]}
|
||||
51
html5lib/tests/testdata/serializer/whitespace.test
vendored
Normal file
51
html5lib/tests/testdata/serializer/whitespace.test
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
{"tests": [
|
||||
|
||||
{"description": "bare text with leading spaces",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["Characters", "\t\r\n\u000C foo"]],
|
||||
"expected": [" foo"]
|
||||
},
|
||||
|
||||
{"description": "bare text with trailing spaces",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["Characters", "foo \t\r\n\u000C"]],
|
||||
"expected": ["foo "]
|
||||
},
|
||||
|
||||
{"description": "bare text with inner spaces",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["Characters", "foo \t\r\n\u000C bar"]],
|
||||
"expected": ["foo bar"]
|
||||
},
|
||||
|
||||
{"description": "text within <pre>",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
|
||||
"expected": ["<pre>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</pre>"]
|
||||
},
|
||||
|
||||
{"description": "text within <pre>, with inner markup",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C fo"], ["StartTag", "http://www.w3.org/1999/xhtml", "span", {}], ["Characters", "o \t\r\n\u000C b"], ["EndTag", "http://www.w3.org/1999/xhtml", "span"], ["Characters", "ar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
|
||||
"expected": ["<pre>\t\r\n\u000C fo<span>o \t\r\n\u000C b</span>ar \t\r\n\u000C</pre>"]
|
||||
},
|
||||
|
||||
{"description": "text within <textarea>",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "textarea", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "textarea"]],
|
||||
"expected": ["<textarea>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</textarea>"]
|
||||
},
|
||||
|
||||
{"description": "text within <script>",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "script"]],
|
||||
"expected": ["<script>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</script>"]
|
||||
},
|
||||
|
||||
{"description": "text within <style>",
|
||||
"options": {"strip_whitespace": true},
|
||||
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "style"]],
|
||||
"expected": ["<style>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</style>"]
|
||||
}
|
||||
|
||||
]}
|
||||
43
html5lib/tests/testdata/sniffer/htmlOrFeed.json
vendored
Normal file
43
html5lib/tests/testdata/sniffer/htmlOrFeed.json
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
[
|
||||
{"type": "text/html", "input": ""},
|
||||
{"type": "text/html", "input": "<!---->"},
|
||||
{"type": "text/html", "input": "<!--asdfaslkjdf;laksjdf as;dkfjsd-->"},
|
||||
{"type": "text/html", "input": "<!"},
|
||||
{"type": "text/html", "input": "\t"},
|
||||
{"type": "text/html", "input": "<!>"},
|
||||
{"type": "text/html", "input": "<?"},
|
||||
{"type": "text/html", "input": "<??>"},
|
||||
{"type": "application/rss+xml", "input": "<rss"},
|
||||
{"type": "application/atom+xml", "input": "<feed"},
|
||||
{"type": "text/html", "input": "<html"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<html><head>\n<title>302 Found</title>\n</head><body>\n<h1>Found</h1>\n<p>The document has moved <a href=\"http://feeds.feedburner.com/gofug\">here</a>.</p>\n</body></html>\n"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\r\n<HTML><HEAD>\r\n <link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/289619328/feed.css\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/431602649/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/382549546/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/314618017/feed.css\" /><META http-equiv=\"expires\" content="},
|
||||
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\r\n<html>\r\n<head>\r\n<title>Xiaxue - Chicken pie blogger.</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><style type=\"text/css\">\r\n<style type=\"text/css\">\r\n<!--\r\nbody {\r\n background-color: #FFF2F2;\r\n}\r\n.style1 {font-family: Georgia, \"Times New Roman\", Times, serif}\r\n.style2 {\r\n color: #8a567c;\r\n font-size: 14px;\r\n font-family: Georgia, \"Times New Roman\", Times, serif;\r\n}\r"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head> \r\n<title>Google Operating System</title>\r\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"Description\" content=\"Unofficial news and tips about Google. A blog that watches Google's latest developments and the attempts to move your operating system online.\" />\r\n<meta name=\"generator\" c"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>Assimilated Press</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Assimilated Press - Atom\" href=\"http://assimila"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>PostSecret</title>\r\n<META name=\"keywords\" Content=\"secrets, postcard, secret, postcards, postsecret, postsecrets,online confessional, post secret, post secrets, artomatic, post a secret\"><META name=\"discription\" Content=\"See a Secret...Share a Secret\"> <meta http-equiv=\"Content-Type\" content=\"te"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns='http://www.w3.org/1999/xhtml' xmlns:b='http://www.google.com/2005/gml/b' xmlns:data='http://www.google.com/2005/gml/data' xmlns:expr='http://www.google.com/2005/gml/expr'>\n <head>\n \n <meta content='text/html; charset=UTF-8' http-equiv='Content-Type'/>\n <meta content='true' name='MSSmartTagsPreventParsing'/>\n <meta content='blogger' name='generator'/>\n <link rel=\"alternate\" typ"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\">\n<head profile=\"http://gmpg.org/xfn/11\"> \n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /> \n<title> CMS Lever</title><link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"http://s.wordpress.com/wp-content/themes/pub/twenty-eight/2813.css\"/>\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" h"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> Park Avenue Peerage</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://parkavenuepeerage.wordpress.com/feed/\" />\t<link rel=\"pingback\" href="},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> \u884c\u96f2\u6d41\u6c34 -like a floating clouds and running water-</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://shw4.wordpress.com/feed/\" />\t<li"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Go Fug Yourself</title><link rel=\"stylesheet\" href=\"http://gofugyourself.typepad.com/go_fug_yourself/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Atom\" "},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head profile=\"http://gmpg.org/xfn/11\">\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /><title> Ladies…</title><meta name=\"generator\" content=\"WordPress.com\" /> <!-- leave this for stats --><link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/default/style.css?1\" type=\"tex"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n <title>The Sartorialist</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"The Sartorialist - Atom\" href=\"http://thesartorialist.blogspot"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Creating Passionate Users</title><link rel=\"stylesheet\" href=\"http://headrush.typepad.com/creating_passionate_users/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n <meta name=\"keywords\" content=\"marketing, blog, seth, ideas, respect, permission\" />\n <meta name=\"description\" content=\"Seth Godin's riffs on marketing, respect, and the "},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n \n <meta name=\"description\" content=\" Western Civilization hangs in the balance. This blog is part of the solution,the cure. Get your heads out of the sand and Fight the G"},
|
||||
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\" />\n<title> From Under the Rotunda</title>\n<link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/pub/andreas04/style.css\" type=\"text/css\""},
|
||||
{"type": "application/atom+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href=\"http://www.blogger.com/styles/atom.css\" type=\"text/css\"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-10861780</id><updated>2007-07-27T12:38:50.888-07:00</updated><title type='text'>Official Google Blog</title><link rel='alternate' type='text/html' href='http://googleblog.blogspot.com/'/><link rel='next' type='application/atom+xml' href='http://googleblog.blogs"},
|
||||
{"type": "application/rss+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><rss xmlns:atom='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' version='2.0'><channel><atom:id>tag:blogger.com,1999:blog-10861780</atom:id><lastBuildDate>Fri, 27 Jul 2007 19:38:50 +0000</lastBuildDate><title>Official Google Blog</title><description/><link>http://googleblog.blogspot.com/</link><managingEditor>Eric Case</managingEditor><generator>Blogger</generator><openSearch:totalResults>729</openSearch:totalResults><openSearc"},
|
||||
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>From Under the Rotunda</title>\n\t<link>http://dannybernardi.wordpress.com</link>\n\t<description>The Monographs of Danny Ber"},
|
||||
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>CMS Lever</title>\n\t<link>http://kanaguri.wordpress.com</link>\n\t<description>CMS\u306e\u6c17\u306b\u306a\u3063\u305f\u3053\u3068</description>\n\t<pubDate>Wed, 18 Jul 2007 21:26:22 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>ja</languag"},
|
||||
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\">\n <title>Atlas Shrugs</title>\n <link rel=\"self\" type=\"application/atom+xml\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/atom.xml\" />\n <link rel=\"alternate\" type=\"text/html\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/\" />\n <id>tag:typepad.com,2003:weblog-132946</id>\n <updated>2007-08-15T16:07:34-04"},
|
||||
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Creating Passionate Users</title>\r\n "},
|
||||
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Seth's Blog</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://sethgodin.typepad.com/seths_blog/\" />\r\n <link rel=\"s"},
|
||||
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:openSearch=\"http://a9.com/-/spec/opensearchrss/1.0/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\"><id>tag:blogger.com,1999:blog-32454861</id><updated>2007-07-31T21:44:09.867+02:00</upd"},
|
||||
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atomfull.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://purl.org/atom/ns#\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"0.3\">\r\n <title>Go Fug Yourself</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://go"},
|
||||
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/rss2full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><rss xmlns:creativeCommons=\"http://backend.userland.com/creativeCommonsRssModule\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"2.0\"><channel><title>Google Operating System</title><link>http://googlesystem.blogspot.com/</link>"},
|
||||
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>Nunublog</title>\n\t<link>http://nunubh.wordpress.com</link>\n\t<description>Just Newbie Blog!</description>\n\t<pubDate>Mon, 09 Jul 2007 18:54:09 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>id</language>\n\t\t\t<item>\n\t\t<ti"},
|
||||
{"type": "text/html", "input": "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<HEAD>\r\n<TITLE>Design*Sponge</TITLE><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Design*Sponge - Atom\" href=\"http://designsponge.blogspot.com/feeds/posts/default\" />\r\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Design*Sponge - RSS\" href="},
|
||||
{"type": "text/html", "input": "<HTML>\n<HEAD>\n<TITLE>Moved Temporarily</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"#FFFFFF\" TEXT=\"#000000\">\n<H1>Moved Temporarily</H1>\nThe document has moved <A HREF=\"http://feeds.feedburner.com/thesecretdiaryofstevejobs\">here</A>.\n</BODY>\n</HTML>\n"}
|
||||
]
|
||||
75
html5lib/tests/testdata/tokenizer/contentModelFlags.test
vendored
Normal file
75
html5lib/tests/testdata/tokenizer/contentModelFlags.test
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"PLAINTEXT content model flag",
|
||||
"initialStates":["PLAINTEXT state"],
|
||||
"lastStartTag":"plaintext",
|
||||
"input":"<head>&body;",
|
||||
"output":[["Character", "<head>&body;"]]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp>",
|
||||
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xMp>",
|
||||
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp ",
|
||||
"output":[["Character", "foo"], "ParseError"]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp",
|
||||
"output":[["Character", "foo</xmp"]]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp/",
|
||||
"output":[["Character", "foo"], "ParseError"]},
|
||||
|
||||
{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp<",
|
||||
"output":[["Character", "foo</xmp<"]]},
|
||||
|
||||
{"description":"End tag with incorrect name in RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</foo>bar</xmp>",
|
||||
"output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</foo>bar</xmpaar>",
|
||||
"output":[["Character", "</foo>bar</xmpaar>"]]},
|
||||
|
||||
{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo</xmp></baz>",
|
||||
"output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
|
||||
|
||||
{"description":"RAWTEXT w/ something looking like an entity",
|
||||
"initialStates":["RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"&foo;",
|
||||
"output":[["Character", "&foo;"]]},
|
||||
|
||||
{"description":"RCDATA w/ an entity",
|
||||
"initialStates":["RCDATA state"],
|
||||
"lastStartTag":"textarea",
|
||||
"input":"<",
|
||||
"output":[["Character", "<"]]}
|
||||
|
||||
]}
|
||||
90
html5lib/tests/testdata/tokenizer/domjs.test
vendored
Normal file
90
html5lib/tests/testdata/tokenizer/domjs.test
vendored
Normal file
@@ -0,0 +1,90 @@
|
||||
{
|
||||
"tests": [
|
||||
{
|
||||
"description":"CR in bogus comment state",
|
||||
"input":"<?\u000d",
|
||||
"output":["ParseError", ["Comment", "?\u000a"]]
|
||||
},
|
||||
{
|
||||
"description":"CRLF in bogus comment state",
|
||||
"input":"<?\u000d\u000a",
|
||||
"output":["ParseError", ["Comment", "?\u000a"]]
|
||||
},
|
||||
{
|
||||
"description":"NUL in RCDATA and RAWTEXT",
|
||||
"doubleEscaped":true,
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"input":"\\u0000",
|
||||
"output":["ParseError", ["Character", "\\uFFFD"]]
|
||||
},
|
||||
{
|
||||
"description":"skip first BOM but not later ones",
|
||||
"input":"\uFEFFfoo\uFEFFbar",
|
||||
"output":[["Character", "foo\uFEFFbar"]]
|
||||
},
|
||||
{
|
||||
"description":"Non BMP-charref in in RCDATA",
|
||||
"initialStates":["RCDATA state"],
|
||||
"input":"≂̸",
|
||||
"output":[["Character", "\u2242\u0338"]]
|
||||
},
|
||||
{
|
||||
"description":"Bad charref in in RCDATA",
|
||||
"initialStates":["RCDATA state"],
|
||||
"input":"&NotEqualTild;",
|
||||
"output":["ParseError", ["Character", "&NotEqualTild;"]]
|
||||
},
|
||||
{
|
||||
"description":"lowercase endtags in RCDATA and RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</XMP>",
|
||||
"output":[["EndTag","xmp"]]
|
||||
},
|
||||
{
|
||||
"description":"bad endtag in RCDATA and RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</ XMP>",
|
||||
"output":[["Character","</ XMP>"]]
|
||||
},
|
||||
{
|
||||
"description":"bad endtag in RCDATA and RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</xm>",
|
||||
"output":[["Character","</xm>"]]
|
||||
},
|
||||
{
|
||||
"description":"bad endtag in RCDATA and RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</xm ",
|
||||
"output":[["Character","</xm "]]
|
||||
},
|
||||
{
|
||||
"description":"bad endtag in RCDATA and RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"</xm/",
|
||||
"output":[["Character","</xm/"]]
|
||||
},
|
||||
{
|
||||
"description":"Non BMP-charref in attribute",
|
||||
"input":"<p id=\"≂̸\">",
|
||||
"output":[["StartTag", "p", {"id":"\u2242\u0338"}]]
|
||||
},
|
||||
{
|
||||
"description":"--!NUL in comment ",
|
||||
"doubleEscaped":true,
|
||||
"input":"<!----!\\u0000-->",
|
||||
"output":["ParseError", ["Comment", "--!\\uFFFD"]]
|
||||
},
|
||||
{
|
||||
"description":"space EOF after doctype ",
|
||||
"input":"<!DOCTYPE html ",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null , false]]
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
283
html5lib/tests/testdata/tokenizer/entities.test
vendored
Normal file
283
html5lib/tests/testdata/tokenizer/entities.test
vendored
Normal file
@@ -0,0 +1,283 @@
|
||||
{"tests": [
|
||||
|
||||
{"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.",
|
||||
"input":"<h a='¬i;'>",
|
||||
"output": ["ParseError", ["StartTag", "h", {"a": "¬i;"}]]},
|
||||
|
||||
{"description": "Entity name followed by the equals sign in an attribute value.",
|
||||
"input":"<h a='&lang='>",
|
||||
"output": ["ParseError", ["StartTag", "h", {"a": "&lang="}]]},
|
||||
|
||||
{"description": "CR as numeric entity",
|
||||
"input":"
",
|
||||
"output": ["ParseError", ["Character", "\r"]]},
|
||||
|
||||
{"description": "CR as hexadecimal numeric entity",
|
||||
"input":"
",
|
||||
"output": ["ParseError", ["Character", "\r"]]},
|
||||
|
||||
{"description": "Windows-1252 EURO SIGN numeric entity.",
|
||||
"input":"€",
|
||||
"output": ["ParseError", ["Character", "\u20AC"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u0081"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK numeric entity.",
|
||||
"input":"‚",
|
||||
"output": ["ParseError", ["Character", "\u201A"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK numeric entity.",
|
||||
"input":"ƒ",
|
||||
"output": ["ParseError", ["Character", "\u0192"]]},
|
||||
|
||||
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK numeric entity.",
|
||||
"input":"„",
|
||||
"output": ["ParseError", ["Character", "\u201E"]]},
|
||||
|
||||
{"description": "Windows-1252 HORIZONTAL ELLIPSIS numeric entity.",
|
||||
"input":"…",
|
||||
"output": ["ParseError", ["Character", "\u2026"]]},
|
||||
|
||||
{"description": "Windows-1252 DAGGER numeric entity.",
|
||||
"input":"†",
|
||||
"output": ["ParseError", ["Character", "\u2020"]]},
|
||||
|
||||
{"description": "Windows-1252 DOUBLE DAGGER numeric entity.",
|
||||
"input":"‡",
|
||||
"output": ["ParseError", ["Character", "\u2021"]]},
|
||||
|
||||
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT numeric entity.",
|
||||
"input":"ˆ",
|
||||
"output": ["ParseError", ["Character", "\u02C6"]]},
|
||||
|
||||
{"description": "Windows-1252 PER MILLE SIGN numeric entity.",
|
||||
"input":"‰",
|
||||
"output": ["ParseError", ["Character", "\u2030"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON numeric entity.",
|
||||
"input":"Š",
|
||||
"output": ["ParseError", ["Character", "\u0160"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK numeric entity.",
|
||||
"input":"‹",
|
||||
"output": ["ParseError", ["Character", "\u2039"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE numeric entity.",
|
||||
"input":"Œ",
|
||||
"output": ["ParseError", ["Character", "\u0152"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u008D"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON numeric entity.",
|
||||
"input":"Ž",
|
||||
"output": ["ParseError", ["Character", "\u017D"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u008F"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u0090"]]},
|
||||
|
||||
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK numeric entity.",
|
||||
"input":"‘",
|
||||
"output": ["ParseError", ["Character", "\u2018"]]},
|
||||
|
||||
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK numeric entity.",
|
||||
"input":"’",
|
||||
"output": ["ParseError", ["Character", "\u2019"]]},
|
||||
|
||||
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK numeric entity.",
|
||||
"input":"“",
|
||||
"output": ["ParseError", ["Character", "\u201C"]]},
|
||||
|
||||
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK numeric entity.",
|
||||
"input":"”",
|
||||
"output": ["ParseError", ["Character", "\u201D"]]},
|
||||
|
||||
{"description": "Windows-1252 BULLET numeric entity.",
|
||||
"input":"•",
|
||||
"output": ["ParseError", ["Character", "\u2022"]]},
|
||||
|
||||
{"description": "Windows-1252 EN DASH numeric entity.",
|
||||
"input":"–",
|
||||
"output": ["ParseError", ["Character", "\u2013"]]},
|
||||
|
||||
{"description": "Windows-1252 EM DASH numeric entity.",
|
||||
"input":"—",
|
||||
"output": ["ParseError", ["Character", "\u2014"]]},
|
||||
|
||||
{"description": "Windows-1252 SMALL TILDE numeric entity.",
|
||||
"input":"˜",
|
||||
"output": ["ParseError", ["Character", "\u02DC"]]},
|
||||
|
||||
{"description": "Windows-1252 TRADE MARK SIGN numeric entity.",
|
||||
"input":"™",
|
||||
"output": ["ParseError", ["Character", "\u2122"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON numeric entity.",
|
||||
"input":"š",
|
||||
"output": ["ParseError", ["Character", "\u0161"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK numeric entity.",
|
||||
"input":"›",
|
||||
"output": ["ParseError", ["Character", "\u203A"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LIGATURE OE numeric entity.",
|
||||
"input":"œ",
|
||||
"output": ["ParseError", ["Character", "\u0153"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u009D"]]},
|
||||
|
||||
{"description": "Windows-1252 EURO SIGN hexadecimal numeric entity.",
|
||||
"input":"€",
|
||||
"output": ["ParseError", ["Character", "\u20AC"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u0081"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"‚",
|
||||
"output": ["ParseError", ["Character", "\u201A"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK hexadecimal numeric entity.",
|
||||
"input":"ƒ",
|
||||
"output": ["ParseError", ["Character", "\u0192"]]},
|
||||
|
||||
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"„",
|
||||
"output": ["ParseError", ["Character", "\u201E"]]},
|
||||
|
||||
{"description": "Windows-1252 HORIZONTAL ELLIPSIS hexadecimal numeric entity.",
|
||||
"input":"…",
|
||||
"output": ["ParseError", ["Character", "\u2026"]]},
|
||||
|
||||
{"description": "Windows-1252 DAGGER hexadecimal numeric entity.",
|
||||
"input":"†",
|
||||
"output": ["ParseError", ["Character", "\u2020"]]},
|
||||
|
||||
{"description": "Windows-1252 DOUBLE DAGGER hexadecimal numeric entity.",
|
||||
"input":"‡",
|
||||
"output": ["ParseError", ["Character", "\u2021"]]},
|
||||
|
||||
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT hexadecimal numeric entity.",
|
||||
"input":"ˆ",
|
||||
"output": ["ParseError", ["Character", "\u02C6"]]},
|
||||
|
||||
{"description": "Windows-1252 PER MILLE SIGN hexadecimal numeric entity.",
|
||||
"input":"‰",
|
||||
"output": ["ParseError", ["Character", "\u2030"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON hexadecimal numeric entity.",
|
||||
"input":"Š",
|
||||
"output": ["ParseError", ["Character", "\u0160"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"‹",
|
||||
"output": ["ParseError", ["Character", "\u2039"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE hexadecimal numeric entity.",
|
||||
"input":"Œ",
|
||||
"output": ["ParseError", ["Character", "\u0152"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u008D"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON hexadecimal numeric entity.",
|
||||
"input":"Ž",
|
||||
"output": ["ParseError", ["Character", "\u017D"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u008F"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u0090"]]},
|
||||
|
||||
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"‘",
|
||||
"output": ["ParseError", ["Character", "\u2018"]]},
|
||||
|
||||
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"’",
|
||||
"output": ["ParseError", ["Character", "\u2019"]]},
|
||||
|
||||
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"“",
|
||||
"output": ["ParseError", ["Character", "\u201C"]]},
|
||||
|
||||
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"”",
|
||||
"output": ["ParseError", ["Character", "\u201D"]]},
|
||||
|
||||
{"description": "Windows-1252 BULLET hexadecimal numeric entity.",
|
||||
"input":"•",
|
||||
"output": ["ParseError", ["Character", "\u2022"]]},
|
||||
|
||||
{"description": "Windows-1252 EN DASH hexadecimal numeric entity.",
|
||||
"input":"–",
|
||||
"output": ["ParseError", ["Character", "\u2013"]]},
|
||||
|
||||
{"description": "Windows-1252 EM DASH hexadecimal numeric entity.",
|
||||
"input":"—",
|
||||
"output": ["ParseError", ["Character", "\u2014"]]},
|
||||
|
||||
{"description": "Windows-1252 SMALL TILDE hexadecimal numeric entity.",
|
||||
"input":"˜",
|
||||
"output": ["ParseError", ["Character", "\u02DC"]]},
|
||||
|
||||
{"description": "Windows-1252 TRADE MARK SIGN hexadecimal numeric entity.",
|
||||
"input":"™",
|
||||
"output": ["ParseError", ["Character", "\u2122"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON hexadecimal numeric entity.",
|
||||
"input":"š",
|
||||
"output": ["ParseError", ["Character", "\u0161"]]},
|
||||
|
||||
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
|
||||
"input":"›",
|
||||
"output": ["ParseError", ["Character", "\u203A"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LIGATURE OE hexadecimal numeric entity.",
|
||||
"input":"œ",
|
||||
"output": ["ParseError", ["Character", "\u0153"]]},
|
||||
|
||||
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
|
||||
"input":"",
|
||||
"output": ["ParseError", ["Character", "\u009D"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN SMALL LETTER Z WITH CARON hexadecimal numeric entity.",
|
||||
"input":"ž",
|
||||
"output": ["ParseError", ["Character", "\u017E"]]},
|
||||
|
||||
{"description": "Windows-1252 LATIN CAPITAL LETTER Y WITH DIAERESIS hexadecimal numeric entity.",
|
||||
"input":"Ÿ",
|
||||
"output": ["ParseError", ["Character", "\u0178"]]},
|
||||
|
||||
{"description": "Decimal numeric entity followed by hex character a.",
|
||||
"input":"aa",
|
||||
"output": ["ParseError", ["Character", "aa"]]},
|
||||
|
||||
{"description": "Decimal numeric entity followed by hex character A.",
|
||||
"input":"aA",
|
||||
"output": ["ParseError", ["Character", "aA"]]},
|
||||
|
||||
{"description": "Decimal numeric entity followed by hex character f.",
|
||||
"input":"af",
|
||||
"output": ["ParseError", ["Character", "af"]]},
|
||||
|
||||
{"description": "Decimal numeric entity followed by hex character A.",
|
||||
"input":"aF",
|
||||
"output": ["ParseError", ["Character", "aF"]]}
|
||||
|
||||
]}
|
||||
33
html5lib/tests/testdata/tokenizer/escapeFlag.test
vendored
Normal file
33
html5lib/tests/testdata/tokenizer/escapeFlag.test
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"Commented close tag in RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo<!--</xmp>--></xmp>",
|
||||
"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"Bogus comment in RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo<!-->baz</xmp>",
|
||||
"output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo<!--></xmp><!-->baz</xmp>",
|
||||
"output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"Commented entities in RCDATA",
|
||||
"initialStates":["RCDATA state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":" & <!-- & --> & </xmp>",
|
||||
"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
|
||||
|
||||
{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
|
||||
"initialStates":["RCDATA state", "RAWTEXT state"],
|
||||
"lastStartTag":"xmp",
|
||||
"input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
|
||||
"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
|
||||
|
||||
]}
|
||||
44189
html5lib/tests/testdata/tokenizer/namedEntities.test
vendored
Normal file
44189
html5lib/tests/testdata/tokenizer/namedEntities.test
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1313
html5lib/tests/testdata/tokenizer/numericEntities.test
vendored
Normal file
1313
html5lib/tests/testdata/tokenizer/numericEntities.test
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7
html5lib/tests/testdata/tokenizer/pendingSpecChanges.test
vendored
Normal file
7
html5lib/tests/testdata/tokenizer/pendingSpecChanges.test
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"<!---- >",
|
||||
"input":"<!---- >",
|
||||
"output":["ParseError", "ParseError", ["Comment","-- >"]]}
|
||||
|
||||
]}
|
||||
196
html5lib/tests/testdata/tokenizer/test1.test
vendored
Normal file
196
html5lib/tests/testdata/tokenizer/test1.test
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"Correct Doctype lowercase",
|
||||
"input":"<!DOCTYPE html>",
|
||||
"output":[["DOCTYPE", "html", null, null, true]]},
|
||||
|
||||
{"description":"Correct Doctype uppercase",
|
||||
"input":"<!DOCTYPE HTML>",
|
||||
"output":[["DOCTYPE", "html", null, null, true]]},
|
||||
|
||||
{"description":"Correct Doctype mixed case",
|
||||
"input":"<!DOCTYPE HtMl>",
|
||||
"output":[["DOCTYPE", "html", null, null, true]]},
|
||||
|
||||
{"description":"Correct Doctype case with EOF",
|
||||
"input":"<!DOCTYPE HtMl",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Truncated doctype start",
|
||||
"input":"<!DOC>",
|
||||
"output":["ParseError", ["Comment", "DOC"]]},
|
||||
|
||||
{"description":"Doctype in error",
|
||||
"input":"<!DOCTYPE foo>",
|
||||
"output":[["DOCTYPE", "foo", null, null, true]]},
|
||||
|
||||
{"description":"Single Start Tag",
|
||||
"input":"<h>",
|
||||
"output":[["StartTag", "h", {}]]},
|
||||
|
||||
{"description":"Empty end tag",
|
||||
"input":"</>",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"Empty start tag",
|
||||
"input":"<>",
|
||||
"output":["ParseError", ["Character", "<>"]]},
|
||||
|
||||
{"description":"Start Tag w/attribute",
|
||||
"input":"<h a='b'>",
|
||||
"output":[["StartTag", "h", {"a":"b"}]]},
|
||||
|
||||
{"description":"Start Tag w/attribute no quotes",
|
||||
"input":"<h a=b>",
|
||||
"output":[["StartTag", "h", {"a":"b"}]]},
|
||||
|
||||
{"description":"Start/End Tag",
|
||||
"input":"<h></h>",
|
||||
"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
|
||||
|
||||
{"description":"Two unclosed start tags",
|
||||
"input":"<p>One<p>Two",
|
||||
"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
|
||||
|
||||
{"description":"End Tag w/attribute",
|
||||
"input":"<h></h a='b'>",
|
||||
"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
|
||||
|
||||
{"description":"Multiple atts",
|
||||
"input":"<h a='b' c='d'>",
|
||||
"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
|
||||
|
||||
{"description":"Multiple atts no space",
|
||||
"input":"<h a='b'c='d'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"b", "c":"d"}]]},
|
||||
|
||||
{"description":"Repeated attr",
|
||||
"input":"<h a='b' a='d'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
|
||||
|
||||
{"description":"Simple comment",
|
||||
"input":"<!--comment-->",
|
||||
"output":[["Comment", "comment"]]},
|
||||
|
||||
{"description":"Comment, Central dash no space",
|
||||
"input":"<!----->",
|
||||
"output":["ParseError", ["Comment", "-"]]},
|
||||
|
||||
{"description":"Comment, two central dashes",
|
||||
"input":"<!-- --comment -->",
|
||||
"output":["ParseError", ["Comment", " --comment "]]},
|
||||
|
||||
{"description":"Unfinished comment",
|
||||
"input":"<!--comment",
|
||||
"output":["ParseError", ["Comment", "comment"]]},
|
||||
|
||||
{"description":"Start of a comment",
|
||||
"input":"<!-",
|
||||
"output":["ParseError", ["Comment", "-"]]},
|
||||
|
||||
{"description":"Short comment",
|
||||
"input":"<!-->",
|
||||
"output":["ParseError", ["Comment", ""]]},
|
||||
|
||||
{"description":"Short comment two",
|
||||
"input":"<!--->",
|
||||
"output":["ParseError", ["Comment", ""]]},
|
||||
|
||||
{"description":"Short comment three",
|
||||
"input":"<!---->",
|
||||
"output":[["Comment", ""]]},
|
||||
|
||||
|
||||
{"description":"Ampersand EOF",
|
||||
"input":"&",
|
||||
"output":[["Character", "&"]]},
|
||||
|
||||
{"description":"Ampersand ampersand EOF",
|
||||
"input":"&&",
|
||||
"output":[["Character", "&&"]]},
|
||||
|
||||
{"description":"Ampersand space EOF",
|
||||
"input":"& ",
|
||||
"output":[["Character", "& "]]},
|
||||
|
||||
{"description":"Unfinished entity",
|
||||
"input":"&f",
|
||||
"output":["ParseError", ["Character", "&f"]]},
|
||||
|
||||
{"description":"Ampersand, number sign",
|
||||
"input":"&#",
|
||||
"output":["ParseError", ["Character", "&#"]]},
|
||||
|
||||
{"description":"Unfinished numeric entity",
|
||||
"input":"&#x",
|
||||
"output":["ParseError", ["Character", "&#x"]]},
|
||||
|
||||
{"description":"Entity with trailing semicolon (1)",
|
||||
"input":"I'm ¬it",
|
||||
"output":[["Character","I'm \u00ACit"]]},
|
||||
|
||||
{"description":"Entity with trailing semicolon (2)",
|
||||
"input":"I'm ∉",
|
||||
"output":[["Character","I'm \u2209"]]},
|
||||
|
||||
{"description":"Entity without trailing semicolon (1)",
|
||||
"input":"I'm ¬it",
|
||||
"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACit"]]},
|
||||
|
||||
{"description":"Entity without trailing semicolon (2)",
|
||||
"input":"I'm ¬in",
|
||||
"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACin"]]},
|
||||
|
||||
{"description":"Partial entity match at end of file",
|
||||
"input":"I'm &no",
|
||||
"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
|
||||
|
||||
{"description":"Non-ASCII character reference name",
|
||||
"input":"&\u00AC;",
|
||||
"output":["ParseError", ["Character", "&\u00AC;"]]},
|
||||
|
||||
{"description":"ASCII decimal entity",
|
||||
"input":"$",
|
||||
"output":[["Character","$"]]},
|
||||
|
||||
{"description":"ASCII hexadecimal entity",
|
||||
"input":"?",
|
||||
"output":[["Character","?"]]},
|
||||
|
||||
{"description":"Hexadecimal entity in attribute",
|
||||
"input":"<h a='?'></h>",
|
||||
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon ending in x",
|
||||
"input":"<h a='¬x'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"¬x"}]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon ending in 1",
|
||||
"input":"<h a='¬1'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"¬1"}]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon ending in i",
|
||||
"input":"<h a='¬i'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"¬i"}]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon",
|
||||
"input":"<h a='©'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"\u00A9"}]]},
|
||||
|
||||
{"description":"Unquoted attribute ending in ampersand",
|
||||
"input":"<s o=& t>",
|
||||
"output":[["StartTag","s",{"o":"&","t":""}]]},
|
||||
|
||||
{"description":"Unquoted attribute at end of tag with final character of &, with tag followed by characters",
|
||||
"input":"<a a=a&>foo",
|
||||
"output":[["StartTag", "a", {"a":"a&"}], ["Character", "foo"]]},
|
||||
|
||||
{"description":"plaintext element",
|
||||
"input":"<plaintext>foobar",
|
||||
"output":[["StartTag","plaintext",{}], ["Character","foobar"]]},
|
||||
|
||||
{"description":"Open angled bracket in unquoted attribute value state",
|
||||
"input":"<a a=f<>",
|
||||
"output":["ParseError", ["StartTag", "a", {"a":"f<"}]]}
|
||||
|
||||
]}
|
||||
179
html5lib/tests/testdata/tokenizer/test2.test
vendored
Normal file
179
html5lib/tests/testdata/tokenizer/test2.test
vendored
Normal file
@@ -0,0 +1,179 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"DOCTYPE without name",
|
||||
"input":"<!DOCTYPE>",
|
||||
"output":["ParseError", "ParseError", ["DOCTYPE", "", null, null, false]]},
|
||||
|
||||
{"description":"DOCTYPE without space before name",
|
||||
"input":"<!DOCTYPEhtml>",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
|
||||
|
||||
{"description":"Incorrect DOCTYPE without a space before name",
|
||||
"input":"<!DOCTYPEfoo>",
|
||||
"output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
|
||||
|
||||
{"description":"DOCTYPE with publicId",
|
||||
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
|
||||
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
|
||||
|
||||
{"description":"DOCTYPE with EOF after PUBLIC",
|
||||
"input":"<!DOCTYPE html PUBLIC",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"DOCTYPE with EOF after PUBLIC '",
|
||||
"input":"<!DOCTYPE html PUBLIC '",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
|
||||
|
||||
{"description":"DOCTYPE with EOF after PUBLIC 'x",
|
||||
"input":"<!DOCTYPE html PUBLIC 'x",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
|
||||
|
||||
{"description":"DOCTYPE with systemId",
|
||||
"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
|
||||
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
|
||||
|
||||
{"description":"DOCTYPE with publicId and systemId",
|
||||
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
|
||||
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
|
||||
|
||||
{"description":"DOCTYPE with > in double-quoted publicId",
|
||||
"input":"<!DOCTYPE html PUBLIC \">x",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
|
||||
|
||||
{"description":"DOCTYPE with > in single-quoted publicId",
|
||||
"input":"<!DOCTYPE html PUBLIC '>x",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
|
||||
|
||||
{"description":"DOCTYPE with > in double-quoted systemId",
|
||||
"input":"<!DOCTYPE html PUBLIC \"foo\" \">x",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
|
||||
|
||||
{"description":"DOCTYPE with > in single-quoted systemId",
|
||||
"input":"<!DOCTYPE html PUBLIC 'foo' '>x",
|
||||
"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
|
||||
|
||||
{"description":"Incomplete doctype",
|
||||
"input":"<!DOCTYPE html ",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Numeric entity representing the NUL character",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity representing the NUL character",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity pair representing a surrogate pair",
|
||||
"input":"��",
|
||||
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity with mixed uppercase and lowercase",
|
||||
"input":"ꯍ",
|
||||
"output":[["Character", "\uABCD"]]},
|
||||
|
||||
{"description":"Entity without a name",
|
||||
"input":"&;",
|
||||
"output":["ParseError", ["Character", "&;"]]},
|
||||
|
||||
{"description":"Unescaped ampersand in attribute value",
|
||||
"input":"<h a='&'>",
|
||||
"output":[["StartTag", "h", { "a":"&" }]]},
|
||||
|
||||
{"description":"StartTag containing <",
|
||||
"input":"<a<b>",
|
||||
"output":[["StartTag", "a<b", { }]]},
|
||||
|
||||
{"description":"Non-void element containing trailing /",
|
||||
"input":"<h/>",
|
||||
"output":[["StartTag","h",{},true]]},
|
||||
|
||||
{"description":"Void element with permitted slash",
|
||||
"input":"<br/>",
|
||||
"output":[["StartTag","br",{},true]]},
|
||||
|
||||
{"description":"Void element with permitted slash (with attribute)",
|
||||
"input":"<br foo='bar'/>",
|
||||
"output":[["StartTag","br",{"foo":"bar"},true]]},
|
||||
|
||||
{"description":"StartTag containing /",
|
||||
"input":"<h/a='b'>",
|
||||
"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
|
||||
|
||||
{"description":"Double-quoted attribute value",
|
||||
"input":"<h a=\"b\">",
|
||||
"output":[["StartTag", "h", { "a":"b" }]]},
|
||||
|
||||
{"description":"Unescaped </",
|
||||
"input":"</",
|
||||
"output":["ParseError", ["Character", "</"]]},
|
||||
|
||||
{"description":"Illegal end tag name",
|
||||
"input":"</1>",
|
||||
"output":["ParseError", ["Comment", "1"]]},
|
||||
|
||||
{"description":"Simili processing instruction",
|
||||
"input":"<?namespace>",
|
||||
"output":["ParseError", ["Comment", "?namespace"]]},
|
||||
|
||||
{"description":"A bogus comment stops at >, even if preceeded by two dashes",
|
||||
"input":"<?foo-->",
|
||||
"output":["ParseError", ["Comment", "?foo--"]]},
|
||||
|
||||
{"description":"Unescaped <",
|
||||
"input":"foo < bar",
|
||||
"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
|
||||
|
||||
{"description":"Null Byte Replacement",
|
||||
"input":"\u0000",
|
||||
"output":["ParseError", ["Character", "\u0000"]]},
|
||||
|
||||
{"description":"Comment with dash",
|
||||
"input":"<!---x",
|
||||
"output":["ParseError", ["Comment", "-x"]]},
|
||||
|
||||
{"description":"Entity + newline",
|
||||
"input":"\nx\n>\n",
|
||||
"output":[["Character","\nx\n>\n"]]},
|
||||
|
||||
{"description":"Start tag with no attributes but space before the greater-than sign",
|
||||
"input":"<h >",
|
||||
"output":[["StartTag", "h", {}]]},
|
||||
|
||||
{"description":"Empty attribute followed by uppercase attribute",
|
||||
"input":"<h a B=''>",
|
||||
"output":[["StartTag", "h", {"a":"", "b":""}]]},
|
||||
|
||||
{"description":"Double-quote after attribute name",
|
||||
"input":"<h a \">",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"", "\"":""}]]},
|
||||
|
||||
{"description":"Single-quote after attribute name",
|
||||
"input":"<h a '>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"", "'":""}]]},
|
||||
|
||||
{"description":"Empty end tag with following characters",
|
||||
"input":"a</>bc",
|
||||
"output":[["Character", "a"], "ParseError", ["Character", "bc"]]},
|
||||
|
||||
{"description":"Empty end tag with following tag",
|
||||
"input":"a</><b>c",
|
||||
"output":[["Character", "a"], "ParseError", ["StartTag", "b", {}], ["Character", "c"]]},
|
||||
|
||||
{"description":"Empty end tag with following comment",
|
||||
"input":"a</><!--b-->c",
|
||||
"output":[["Character", "a"], "ParseError", ["Comment", "b"], ["Character", "c"]]},
|
||||
|
||||
{"description":"Empty end tag with following end tag",
|
||||
"input":"a</></b>c",
|
||||
"output":[["Character", "a"], "ParseError", ["EndTag", "b"], ["Character", "c"]]}
|
||||
|
||||
]}
|
||||
6047
html5lib/tests/testdata/tokenizer/test3.test
vendored
Normal file
6047
html5lib/tests/testdata/tokenizer/test3.test
vendored
Normal file
File diff suppressed because it is too large
Load Diff
344
html5lib/tests/testdata/tokenizer/test4.test
vendored
Normal file
344
html5lib/tests/testdata/tokenizer/test4.test
vendored
Normal file
@@ -0,0 +1,344 @@
|
||||
{"tests": [
|
||||
|
||||
{"description":"< in attribute name",
|
||||
"input":"<z/0 <>",
|
||||
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
|
||||
|
||||
{"description":"< in attribute value",
|
||||
"input":"<z x=<>",
|
||||
"output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
|
||||
|
||||
{"description":"= in unquoted attribute value",
|
||||
"input":"<z z=z=z>",
|
||||
"output":["ParseError", ["StartTag", "z", {"z": "z=z"}]]},
|
||||
|
||||
{"description":"= attribute",
|
||||
"input":"<z =>",
|
||||
"output":["ParseError", ["StartTag", "z", {"=": ""}]]},
|
||||
|
||||
{"description":"== attribute",
|
||||
"input":"<z ==>",
|
||||
"output":["ParseError", "ParseError", ["StartTag", "z", {"=": ""}]]},
|
||||
|
||||
{"description":"=== attribute",
|
||||
"input":"<z ===>",
|
||||
"output":["ParseError", "ParseError", ["StartTag", "z", {"=": "="}]]},
|
||||
|
||||
{"description":"==== attribute",
|
||||
"input":"<z ====>",
|
||||
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"=": "=="}]]},
|
||||
|
||||
{"description":"Allowed \" after ampersand in attribute value",
|
||||
"input":"<z z=\"&\">",
|
||||
"output":[["StartTag", "z", {"z": "&"}]]},
|
||||
|
||||
{"description":"Non-allowed ' after ampersand in attribute value",
|
||||
"input":"<z z=\"&'\">",
|
||||
"output":["ParseError", ["StartTag", "z", {"z": "&'"}]]},
|
||||
|
||||
{"description":"Allowed ' after ampersand in attribute value",
|
||||
"input":"<z z='&'>",
|
||||
"output":[["StartTag", "z", {"z": "&"}]]},
|
||||
|
||||
{"description":"Non-allowed \" after ampersand in attribute value",
|
||||
"input":"<z z='&\"'>",
|
||||
"output":["ParseError", ["StartTag", "z", {"z": "&\""}]]},
|
||||
|
||||
{"description":"Text after bogus character reference",
|
||||
"input":"<z z='&xlink_xmlns;'>bar<z>",
|
||||
"output":["ParseError",["StartTag","z",{"z":"&xlink_xmlns;"}],["Character","bar"],["StartTag","z",{}]]},
|
||||
|
||||
{"description":"Text after hex character reference",
|
||||
"input":"<z z='  foo'>bar<z>",
|
||||
"output":[["StartTag","z",{"z":" foo"}],["Character","bar"],["StartTag","z",{}]]},
|
||||
|
||||
{"description":"Attribute name starting with \"",
|
||||
"input":"<foo \"='bar'>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"\"": "bar"}]]},
|
||||
|
||||
{"description":"Attribute name starting with '",
|
||||
"input":"<foo '='bar'>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"'": "bar"}]]},
|
||||
|
||||
{"description":"Attribute name containing \"",
|
||||
"input":"<foo a\"b='bar'>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a\"b": "bar"}]]},
|
||||
|
||||
{"description":"Attribute name containing '",
|
||||
"input":"<foo a'b='bar'>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a'b": "bar"}]]},
|
||||
|
||||
{"description":"Unquoted attribute value containing '",
|
||||
"input":"<foo a=b'c>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a": "b'c"}]]},
|
||||
|
||||
{"description":"Unquoted attribute value containing \"",
|
||||
"input":"<foo a=b\"c>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a": "b\"c"}]]},
|
||||
|
||||
{"description":"Double-quoted attribute value not followed by whitespace",
|
||||
"input":"<foo a=\"b\"c>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
|
||||
|
||||
{"description":"Single-quoted attribute value not followed by whitespace",
|
||||
"input":"<foo a='b'c>",
|
||||
"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
|
||||
|
||||
{"description":"Quoted attribute followed by permitted /",
|
||||
"input":"<br a='b'/>",
|
||||
"output":[["StartTag","br",{"a":"b"},true]]},
|
||||
|
||||
{"description":"Quoted attribute followed by non-permitted /",
|
||||
"input":"<bar a='b'/>",
|
||||
"output":[["StartTag","bar",{"a":"b"},true]]},
|
||||
|
||||
{"description":"CR EOF after doctype name",
|
||||
"input":"<!doctype html \r",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"CR EOF in tag name",
|
||||
"input":"<z\r",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"Slash EOF in tag name",
|
||||
"input":"<z/",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"Zero hex numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Zero decimal numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Zero-prefixed hex numeric entity",
|
||||
"input":"A",
|
||||
"output":[["Character", "A"]]},
|
||||
|
||||
{"description":"Zero-prefixed decimal numeric entity",
|
||||
"input":"A",
|
||||
"output":[["Character", "A"]]},
|
||||
|
||||
{"description":"Empty hex numeric entities",
|
||||
"input":"&#x &#X ",
|
||||
"output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
|
||||
|
||||
{"description":"Empty decimal numeric entities",
|
||||
"input":"&# &#; ",
|
||||
"output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
|
||||
|
||||
{"description":"Non-BMP numeric entity",
|
||||
"input":"𐀀",
|
||||
"output":[["Character", "\uD800\uDC00"]]},
|
||||
|
||||
{"description":"Maximum non-BMP numeric entity",
|
||||
"input":"",
|
||||
"output":["ParseError", ["Character", "\uDBFF\uDFFF"]]},
|
||||
|
||||
{"description":"Above maximum numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"32-bit hex numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"33-bit hex numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"33-bit decimal numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"65-bit hex numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"65-bit decimal numeric entity",
|
||||
"input":"�",
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Surrogate code point edge cases",
|
||||
"input":"퟿����",
|
||||
"output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
|
||||
|
||||
{"description":"Uppercase start tag name",
|
||||
"input":"<X>",
|
||||
"output":[["StartTag", "x", {}]]},
|
||||
|
||||
{"description":"Uppercase end tag name",
|
||||
"input":"</X>",
|
||||
"output":[["EndTag", "x"]]},
|
||||
|
||||
{"description":"Uppercase attribute name",
|
||||
"input":"<x X>",
|
||||
"output":[["StartTag", "x", { "x":"" }]]},
|
||||
|
||||
{"description":"Tag/attribute name case edge values",
|
||||
"input":"<x@AZ[`az{ @AZ[`az{>",
|
||||
"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
|
||||
|
||||
{"description":"Duplicate different-case attributes",
|
||||
"input":"<x x=1 x=2 X=3>",
|
||||
"output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
|
||||
|
||||
{"description":"Uppercase close tag attributes",
|
||||
"input":"</x X>",
|
||||
"output":["ParseError", ["EndTag", "x"]]},
|
||||
|
||||
{"description":"Duplicate close tag attributes",
|
||||
"input":"</x x x>",
|
||||
"output":["ParseError", "ParseError", ["EndTag", "x"]]},
|
||||
|
||||
{"description":"Permitted slash",
|
||||
"input":"<br/>",
|
||||
"output":[["StartTag","br",{},true]]},
|
||||
|
||||
{"description":"Non-permitted slash",
|
||||
"input":"<xr/>",
|
||||
"output":[["StartTag","xr",{},true]]},
|
||||
|
||||
{"description":"Permitted slash but in close tag",
|
||||
"input":"</br/>",
|
||||
"output":["ParseError", ["EndTag", "br"]]},
|
||||
|
||||
{"description":"Doctype public case-sensitivity (1)",
|
||||
"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
|
||||
"output":[["DOCTYPE", "html", "AbC", "XyZ", true]]},
|
||||
|
||||
{"description":"Doctype public case-sensitivity (2)",
|
||||
"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
|
||||
"output":[["DOCTYPE", "html", "aBc", "xYz", true]]},
|
||||
|
||||
{"description":"Doctype system case-sensitivity (1)",
|
||||
"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
|
||||
"output":[["DOCTYPE", "html", null, "XyZ", true]]},
|
||||
|
||||
{"description":"Doctype system case-sensitivity (2)",
|
||||
"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
|
||||
"output":[["DOCTYPE", "html", null, "xYz", true]]},
|
||||
|
||||
{"description":"U+0000 in lookahead region after non-matching character",
|
||||
"input":"<!doc>\u0000",
|
||||
"output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\u0000"]],
|
||||
"ignoreErrorOrder":true},
|
||||
|
||||
{"description":"U+0000 in lookahead region",
|
||||
"input":"<!doc\u0000",
|
||||
"output":["ParseError", ["Comment", "doc\uFFFD"]],
|
||||
"ignoreErrorOrder":true},
|
||||
|
||||
{"description":"U+0080 in lookahead region",
|
||||
"input":"<!doc\u0080",
|
||||
"output":["ParseError", "ParseError", ["Comment", "doc\u0080"]],
|
||||
"ignoreErrorOrder":true},
|
||||
|
||||
{"description":"U+FDD1 in lookahead region",
|
||||
"input":"<!doc\uFDD1",
|
||||
"output":["ParseError", "ParseError", ["Comment", "doc\uFDD1"]],
|
||||
"ignoreErrorOrder":true},
|
||||
|
||||
{"description":"U+1FFFF in lookahead region",
|
||||
"input":"<!doc\uD83F\uDFFF",
|
||||
"output":["ParseError", "ParseError", ["Comment", "doc\uD83F\uDFFF"]],
|
||||
"ignoreErrorOrder":true},
|
||||
|
||||
{"description":"CR followed by non-LF",
|
||||
"input":"\r?",
|
||||
"output":[["Character", "\n?"]]},
|
||||
|
||||
{"description":"CR at EOF",
|
||||
"input":"\r",
|
||||
"output":[["Character", "\n"]]},
|
||||
|
||||
{"description":"LF at EOF",
|
||||
"input":"\n",
|
||||
"output":[["Character", "\n"]]},
|
||||
|
||||
{"description":"CR LF",
|
||||
"input":"\r\n",
|
||||
"output":[["Character", "\n"]]},
|
||||
|
||||
{"description":"CR CR",
|
||||
"input":"\r\r",
|
||||
"output":[["Character", "\n\n"]]},
|
||||
|
||||
{"description":"LF LF",
|
||||
"input":"\n\n",
|
||||
"output":[["Character", "\n\n"]]},
|
||||
|
||||
{"description":"LF CR",
|
||||
"input":"\n\r",
|
||||
"output":[["Character", "\n\n"]]},
|
||||
|
||||
{"description":"text CR CR CR text",
|
||||
"input":"text\r\r\rtext",
|
||||
"output":[["Character", "text\n\n\ntext"]]},
|
||||
|
||||
{"description":"Doctype publik",
|
||||
"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Doctype publi",
|
||||
"input":"<!DOCTYPE html PUBLI",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Doctype sistem",
|
||||
"input":"<!DOCTYPE html SISTEM \"AbC\">",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Doctype sys",
|
||||
"input":"<!DOCTYPE html SYS",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
|
||||
|
||||
{"description":"Doctype html x>text",
|
||||
"input":"<!DOCTYPE html x>text",
|
||||
"output":["ParseError", ["DOCTYPE", "html", null, null, false], ["Character", "text"]]},
|
||||
|
||||
{"description":"Grave accent in unquoted attribute",
|
||||
"input":"<a a=aa`>",
|
||||
"output":["ParseError", ["StartTag", "a", {"a":"aa`"}]]},
|
||||
|
||||
{"description":"EOF in tag name state ",
|
||||
"input":"<a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in tag name state",
|
||||
"input":"<a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in before attribute name state",
|
||||
"input":"<a ",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in attribute name state",
|
||||
"input":"<a a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in after attribute name state",
|
||||
"input":"<a a ",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in before attribute value state",
|
||||
"input":"<a a =",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in attribute value (double quoted) state",
|
||||
"input":"<a a =\"a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in attribute value (single quoted) state",
|
||||
"input":"<a a ='a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in attribute value (unquoted) state",
|
||||
"input":"<a a =a",
|
||||
"output":["ParseError"]},
|
||||
|
||||
{"description":"EOF in after attribute value state",
|
||||
"input":"<a a ='a'",
|
||||
"output":["ParseError"]}
|
||||
|
||||
]}
|
||||
1295
html5lib/tests/testdata/tokenizer/unicodeChars.test
vendored
Normal file
1295
html5lib/tests/testdata/tokenizer/unicodeChars.test
vendored
Normal file
File diff suppressed because it is too large
Load Diff
27
html5lib/tests/testdata/tokenizer/unicodeCharsProblematic.test
vendored
Normal file
27
html5lib/tests/testdata/tokenizer/unicodeCharsProblematic.test
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
{"tests" : [
|
||||
{"description": "Invalid Unicode character U+DFFF",
|
||||
"doubleEscaped":true,
|
||||
"input": "\\uDFFF",
|
||||
"output":["ParseError", ["Character", "\\uFFFD"]]},
|
||||
|
||||
{"description": "Invalid Unicode character U+D800",
|
||||
"doubleEscaped":true,
|
||||
"input": "\\uD800",
|
||||
"output":["ParseError", ["Character", "\\uFFFD"]]},
|
||||
|
||||
{"description": "Invalid Unicode character U+DFFF with valid preceding character",
|
||||
"doubleEscaped":true,
|
||||
"input": "a\\uDFFF",
|
||||
"output":["ParseError", ["Character", "a\\uFFFD"]]},
|
||||
|
||||
{"description": "Invalid Unicode character U+D800 with valid following character",
|
||||
"doubleEscaped":true,
|
||||
"input": "\\uD800a",
|
||||
"output":["ParseError", ["Character", "\\uFFFDa"]]},
|
||||
|
||||
{"description":"CR followed by U+0000",
|
||||
"input":"\r\u0000",
|
||||
"output":[["Character", "\n"], "ParseError", ["Character", "\u0000"]],
|
||||
"ignoreErrorOrder":true}
|
||||
]
|
||||
}
|
||||
22
html5lib/tests/testdata/tokenizer/xmlViolation.test
vendored
Normal file
22
html5lib/tests/testdata/tokenizer/xmlViolation.test
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{"xmlViolationTests": [
|
||||
|
||||
{"description":"Non-XML character",
|
||||
"input":"a\uFFFFb",
|
||||
"ignoreErrorOrder":true,
|
||||
"output":["ParseError",["Character","a\uFFFDb"]]},
|
||||
|
||||
{"description":"Non-XML space",
|
||||
"input":"a\u000Cb",
|
||||
"ignoreErrorOrder":true,
|
||||
"output":[["Character","a b"]]},
|
||||
|
||||
{"description":"Double hyphen in comment",
|
||||
"input":"<!-- foo -- bar -->",
|
||||
"output":["ParseError",["Comment"," foo - - bar "]]},
|
||||
|
||||
{"description":"FF between attributes",
|
||||
"input":"<a b=''\u000Cc=''>",
|
||||
"output":[["StartTag","a",{"b":"","c":""}]]}
|
||||
]}
|
||||
|
||||
|
||||
194
html5lib/tests/testdata/tree-construction/adoption01.dat
vendored
Normal file
194
html5lib/tests/testdata/tree-construction/adoption01.dat
vendored
Normal file
@@ -0,0 +1,194 @@
|
||||
#data
|
||||
<a><p></a></p>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| <p>
|
||||
| <a>
|
||||
|
||||
#data
|
||||
<a>1<p>2</a>3</p>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <p>
|
||||
| <a>
|
||||
| "2"
|
||||
| "3"
|
||||
|
||||
#data
|
||||
<a>1<button>2</a>3</button>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <button>
|
||||
| <a>
|
||||
| "2"
|
||||
| "3"
|
||||
|
||||
#data
|
||||
<a>1<b>2</a>3</b>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <b>
|
||||
| "2"
|
||||
| <b>
|
||||
| "3"
|
||||
|
||||
#data
|
||||
<a>1<div>2<div>3</a>4</div>5</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <div>
|
||||
| <a>
|
||||
| "2"
|
||||
| <div>
|
||||
| <a>
|
||||
| "3"
|
||||
| "4"
|
||||
| "5"
|
||||
|
||||
#data
|
||||
<table><a>1<p>2</a>3</p>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <p>
|
||||
| <a>
|
||||
| "2"
|
||||
| "3"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<b><b><a><p></a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <b>
|
||||
| <b>
|
||||
| <a>
|
||||
| <p>
|
||||
| <a>
|
||||
|
||||
#data
|
||||
<b><a><b><p></a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <b>
|
||||
| <a>
|
||||
| <b>
|
||||
| <b>
|
||||
| <p>
|
||||
| <a>
|
||||
|
||||
#data
|
||||
<a><b><b><p></a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| <b>
|
||||
| <b>
|
||||
| <b>
|
||||
| <b>
|
||||
| <p>
|
||||
| <a>
|
||||
|
||||
#data
|
||||
<p>1<s id="A">2<b id="B">3</p>4</s>5</b>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| "1"
|
||||
| <s>
|
||||
| id="A"
|
||||
| "2"
|
||||
| <b>
|
||||
| id="B"
|
||||
| "3"
|
||||
| <s>
|
||||
| id="A"
|
||||
| <b>
|
||||
| id="B"
|
||||
| "4"
|
||||
| <b>
|
||||
| id="B"
|
||||
| "5"
|
||||
|
||||
#data
|
||||
<table><a>1<td>2</td>3</table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| "1"
|
||||
| <a>
|
||||
| "3"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "2"
|
||||
|
||||
#data
|
||||
<table>A<td>B</td>C</table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "AC"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "B"
|
||||
|
||||
#data
|
||||
<a><svg><tr><input></a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| <svg svg>
|
||||
| <svg tr>
|
||||
| <svg input>
|
||||
31
html5lib/tests/testdata/tree-construction/adoption02.dat
vendored
Normal file
31
html5lib/tests/testdata/tree-construction/adoption02.dat
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
#data
|
||||
<b>1<i>2<p>3</b>4
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <b>
|
||||
| "1"
|
||||
| <i>
|
||||
| "2"
|
||||
| <i>
|
||||
| <p>
|
||||
| <b>
|
||||
| "3"
|
||||
| "4"
|
||||
|
||||
#data
|
||||
<a><div><style></style><address><a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| <div>
|
||||
| <a>
|
||||
| <style>
|
||||
| <address>
|
||||
| <a>
|
||||
| <a>
|
||||
135
html5lib/tests/testdata/tree-construction/comments01.dat
vendored
Normal file
135
html5lib/tests/testdata/tree-construction/comments01.dat
vendored
Normal file
@@ -0,0 +1,135 @@
|
||||
#data
|
||||
FOO<!-- BAR -->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!-- BAR --!>BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!-- BAR -- >BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -- >BAZ -->
|
||||
|
||||
#data
|
||||
FOO<!-- BAR -- <QUX> -- MUX -->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -- <QUX> -- MUX -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!-- BAR -- <QUX> -- MUX --!>BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -- <QUX> -- MUX -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!-- BAR -- <QUX> -- MUX -- >BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- BAR -- <QUX> -- MUX -- >BAZ -->
|
||||
|
||||
#data
|
||||
FOO<!---->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!--->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
FOO<!-->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- -->
|
||||
| "BAZ"
|
||||
|
||||
#data
|
||||
<?xml version="1.0">Hi
|
||||
#errors
|
||||
#document
|
||||
| <!-- ?xml version="1.0" -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hi"
|
||||
|
||||
#data
|
||||
<?xml version="1.0">
|
||||
#errors
|
||||
#document
|
||||
| <!-- ?xml version="1.0" -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<?xml version
|
||||
#errors
|
||||
#document
|
||||
| <!-- ?xml version -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
FOO<!----->BAZ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <!-- - -->
|
||||
| "BAZ"
|
||||
370
html5lib/tests/testdata/tree-construction/doctype01.dat
vendored
Normal file
370
html5lib/tests/testdata/tree-construction/doctype01.dat
vendored
Normal file
@@ -0,0 +1,370 @@
|
||||
#data
|
||||
<!DOCTYPE html>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!dOctYpE HtMl>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPEhtml>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE >
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE >
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato taco>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato taco "ddd>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato sYstEM>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato sYstEM >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato sYstEM ggg>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato SYSTEM taco >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato SYSTEM 'taco"'>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "" "taco"">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato SYSTEM "taco">Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "" "taco">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato SYSTEM "tai'co">Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "" "tai'co">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato SYSTEMtaco "ddd">Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato grass SYSTEM taco>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato pUbLIc>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato pUbLIc >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato pUbLIcgoof>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato PUBLIC goof>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato PUBLIC "go'of">Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "go'of" "">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato PUBLIC 'go'of'>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "go" "">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato PUBLIC 'go:hh of' >Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "go:hh of" "">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE potato PUBLIC "W3C-//dfdf" SYSTEM ggg>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE potato "W3C-//dfdf" "">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE ...>Hello
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE ...>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Hello"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE root-element [SYSTEM OR PUBLIC FPI] "uri" [
|
||||
<!-- internal declarations -->
|
||||
]>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE root-element>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "]>"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html PUBLIC
|
||||
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
|
||||
"http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"><body><b>Mine!</b></body>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "" "http://www.w3.org/DTD/HTML4-strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <b>
|
||||
| "Mine!"
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
BIN
html5lib/tests/testdata/tree-construction/domjs-unsafe.dat
vendored
Normal file
BIN
html5lib/tests/testdata/tree-construction/domjs-unsafe.dat
vendored
Normal file
Binary file not shown.
603
html5lib/tests/testdata/tree-construction/entities01.dat
vendored
Normal file
603
html5lib/tests/testdata/tree-construction/entities01.dat
vendored
Normal file
@@ -0,0 +1,603 @@
|
||||
#data
|
||||
FOO>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO>BAR"
|
||||
|
||||
#data
|
||||
FOO>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO>BAR"
|
||||
|
||||
#data
|
||||
FOO> BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO> BAR"
|
||||
|
||||
#data
|
||||
FOO>;;BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO>;;BAR"
|
||||
|
||||
#data
|
||||
I'm ¬it; I tell you
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "I'm ¬it; I tell you"
|
||||
|
||||
#data
|
||||
I'm ∉ I tell you
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "I'm ∉ I tell you"
|
||||
|
||||
#data
|
||||
FOO& BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO& BAR"
|
||||
|
||||
#data
|
||||
FOO&<BAR>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&"
|
||||
| <bar>
|
||||
|
||||
#data
|
||||
FOO&&&>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&&&>BAR"
|
||||
|
||||
#data
|
||||
FOO)BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO)BAR"
|
||||
|
||||
#data
|
||||
FOOABAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOABAR"
|
||||
|
||||
#data
|
||||
FOOABAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOABAR"
|
||||
|
||||
#data
|
||||
FOO&#BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&#BAR"
|
||||
|
||||
#data
|
||||
FOO&#ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&#ZOO"
|
||||
|
||||
#data
|
||||
FOOºR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOºR"
|
||||
|
||||
#data
|
||||
FOO&#xZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&#xZOO"
|
||||
|
||||
#data
|
||||
FOO&#XZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO&#XZOO"
|
||||
|
||||
#data
|
||||
FOO)BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO)BAR"
|
||||
|
||||
#data
|
||||
FOO䆺R
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO䆺R"
|
||||
|
||||
#data
|
||||
FOOAZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOAZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOOxZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOxZOO"
|
||||
|
||||
#data
|
||||
FOOyZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOyZOO"
|
||||
|
||||
#data
|
||||
FOO€ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO€ZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOO‚ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO‚ZOO"
|
||||
|
||||
#data
|
||||
FOOƒZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOƒZOO"
|
||||
|
||||
#data
|
||||
FOO„ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO„ZOO"
|
||||
|
||||
#data
|
||||
FOO…ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO…ZOO"
|
||||
|
||||
#data
|
||||
FOO†ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO†ZOO"
|
||||
|
||||
#data
|
||||
FOO‡ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO‡ZOO"
|
||||
|
||||
#data
|
||||
FOOˆZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOˆZOO"
|
||||
|
||||
#data
|
||||
FOO‰ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO‰ZOO"
|
||||
|
||||
#data
|
||||
FOOŠZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOŠZOO"
|
||||
|
||||
#data
|
||||
FOO‹ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO‹ZOO"
|
||||
|
||||
#data
|
||||
FOOŒZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOŒZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOOŽZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOŽZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOO‘ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO‘ZOO"
|
||||
|
||||
#data
|
||||
FOO’ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO’ZOO"
|
||||
|
||||
#data
|
||||
FOO“ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO“ZOO"
|
||||
|
||||
#data
|
||||
FOO”ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO”ZOO"
|
||||
|
||||
#data
|
||||
FOO•ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO•ZOO"
|
||||
|
||||
#data
|
||||
FOO–ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO–ZOO"
|
||||
|
||||
#data
|
||||
FOO—ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO—ZOO"
|
||||
|
||||
#data
|
||||
FOO˜ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO˜ZOO"
|
||||
|
||||
#data
|
||||
FOO™ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO™ZOO"
|
||||
|
||||
#data
|
||||
FOOšZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOšZOO"
|
||||
|
||||
#data
|
||||
FOO›ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO›ZOO"
|
||||
|
||||
#data
|
||||
FOOœZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOœZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOOžZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOžZOO"
|
||||
|
||||
#data
|
||||
FOOŸZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOŸZOO"
|
||||
|
||||
#data
|
||||
FOO ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO ZOO"
|
||||
|
||||
#data
|
||||
FOO퟿ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOO􈟔ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOOZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOOZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
|
||||
#data
|
||||
FOO�ZOO
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO<4F>ZOO"
|
||||
249
html5lib/tests/testdata/tree-construction/entities02.dat
vendored
Normal file
249
html5lib/tests/testdata/tree-construction/entities02.dat
vendored
Normal file
@@ -0,0 +1,249 @@
|
||||
#data
|
||||
<div bar="ZZ>YY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>YY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ&"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ&"
|
||||
|
||||
#data
|
||||
<div bar='ZZ&'></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ&"
|
||||
|
||||
#data
|
||||
<div bar=ZZ&></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ&"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>=YY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>=YY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>0YY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>0YY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>9YY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>9YY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>aYY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>aYY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>ZYY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>ZYY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ> YY"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ> YY"
|
||||
|
||||
#data
|
||||
<div bar="ZZ>"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>"
|
||||
|
||||
#data
|
||||
<div bar='ZZ>'></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>"
|
||||
|
||||
#data
|
||||
<div bar=ZZ>></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ>"
|
||||
|
||||
#data
|
||||
<div bar="ZZ£_id=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ£_id=23"
|
||||
|
||||
#data
|
||||
<div bar="ZZ&prod_id=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ&prod_id=23"
|
||||
|
||||
#data
|
||||
<div bar="ZZ£_id=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ£_id=23"
|
||||
|
||||
#data
|
||||
<div bar="ZZ∏_id=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ∏_id=23"
|
||||
|
||||
#data
|
||||
<div bar="ZZ£=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ£=23"
|
||||
|
||||
#data
|
||||
<div bar="ZZ&prod=23"></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| bar="ZZ&prod=23"
|
||||
|
||||
#data
|
||||
<div>ZZ£_id=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ£_id=23"
|
||||
|
||||
#data
|
||||
<div>ZZ&prod_id=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ&prod_id=23"
|
||||
|
||||
#data
|
||||
<div>ZZ£_id=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ£_id=23"
|
||||
|
||||
#data
|
||||
<div>ZZ∏_id=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ∏_id=23"
|
||||
|
||||
#data
|
||||
<div>ZZ£=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ£=23"
|
||||
|
||||
#data
|
||||
<div>ZZ&prod=23</div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| "ZZ&prod=23"
|
||||
246
html5lib/tests/testdata/tree-construction/html5test-com.dat
vendored
Normal file
246
html5lib/tests/testdata/tree-construction/html5test-com.dat
vendored
Normal file
@@ -0,0 +1,246 @@
|
||||
#data
|
||||
<div<div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div<div>
|
||||
|
||||
#data
|
||||
<div foo<bar=''>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| foo<bar=""
|
||||
|
||||
#data
|
||||
<div foo=`bar`>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| foo="`bar`"
|
||||
|
||||
#data
|
||||
<div \"foo=''>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| \"foo=""
|
||||
|
||||
#data
|
||||
<a href='\nbar'></a>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <a>
|
||||
| href="\nbar"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
⟨⟩
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "⟨⟩"
|
||||
|
||||
#data
|
||||
'
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "'"
|
||||
|
||||
#data
|
||||
ⅈ
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "ⅈ"
|
||||
|
||||
#data
|
||||
𝕂
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "𝕂"
|
||||
|
||||
#data
|
||||
∉
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "∉"
|
||||
|
||||
#data
|
||||
<?import namespace="foo" implementation="#bar">
|
||||
#errors
|
||||
#document
|
||||
| <!-- ?import namespace="foo" implementation="#bar" -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!--foo--bar-->
|
||||
#errors
|
||||
#document
|
||||
| <!-- foo--bar -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<![CDATA[x]]>
|
||||
#errors
|
||||
#document
|
||||
| <!-- [CDATA[x]] -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<textarea><!--</textarea>--></textarea>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <textarea>
|
||||
| "<!--"
|
||||
| "-->"
|
||||
|
||||
#data
|
||||
<textarea><!--</textarea>-->
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <textarea>
|
||||
| "<!--"
|
||||
| "-->"
|
||||
|
||||
#data
|
||||
<style><!--</style>--></style>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <style>
|
||||
| "<!--"
|
||||
| <body>
|
||||
| "-->"
|
||||
|
||||
#data
|
||||
<style><!--</style>-->
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <style>
|
||||
| "<!--"
|
||||
| <body>
|
||||
| "-->"
|
||||
|
||||
#data
|
||||
<ul><li>A </li> <li>B</li></ul>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <ul>
|
||||
| <li>
|
||||
| "A "
|
||||
| " "
|
||||
| <li>
|
||||
| "B"
|
||||
|
||||
#data
|
||||
<table><form><input type=hidden><input></form><div></div></table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <input>
|
||||
| <div>
|
||||
| <table>
|
||||
| <form>
|
||||
| <input>
|
||||
| type="hidden"
|
||||
|
||||
#data
|
||||
<i>A<b>B<p></i>C</b>D
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <i>
|
||||
| "A"
|
||||
| <b>
|
||||
| "B"
|
||||
| <b>
|
||||
| <p>
|
||||
| <b>
|
||||
| <i>
|
||||
| "C"
|
||||
| "D"
|
||||
|
||||
#data
|
||||
<div></div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<svg></svg>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<math></math>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
43
html5lib/tests/testdata/tree-construction/inbody01.dat
vendored
Normal file
43
html5lib/tests/testdata/tree-construction/inbody01.dat
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
#data
|
||||
<button>1</foo>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <button>
|
||||
| "1"
|
||||
|
||||
#data
|
||||
<foo>1<p>2</foo>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <foo>
|
||||
| "1"
|
||||
| <p>
|
||||
| "2"
|
||||
|
||||
#data
|
||||
<dd>1</foo>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <dd>
|
||||
| "1"
|
||||
|
||||
#data
|
||||
<foo>1<dd>2</foo>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <foo>
|
||||
| "1"
|
||||
| <dd>
|
||||
| "2"
|
||||
40
html5lib/tests/testdata/tree-construction/isindex.dat
vendored
Normal file
40
html5lib/tests/testdata/tree-construction/isindex.dat
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
#data
|
||||
<isindex>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
| <hr>
|
||||
| <label>
|
||||
| "This is a searchable index. Enter search keywords: "
|
||||
| <input>
|
||||
| name="isindex"
|
||||
| <hr>
|
||||
|
||||
#data
|
||||
<isindex name="A" action="B" prompt="C" foo="D">
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
| action="B"
|
||||
| <hr>
|
||||
| <label>
|
||||
| "C"
|
||||
| <input>
|
||||
| foo="D"
|
||||
| name="isindex"
|
||||
| <hr>
|
||||
|
||||
#data
|
||||
<form><isindex>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
BIN
html5lib/tests/testdata/tree-construction/pending-spec-changes-plain-text-unsafe.dat
vendored
Normal file
BIN
html5lib/tests/testdata/tree-construction/pending-spec-changes-plain-text-unsafe.dat
vendored
Normal file
Binary file not shown.
52
html5lib/tests/testdata/tree-construction/pending-spec-changes.dat
vendored
Normal file
52
html5lib/tests/testdata/tree-construction/pending-spec-changes.dat
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
#data
|
||||
<input type="hidden"><frameset>
|
||||
#errors
|
||||
21: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
|
||||
31: “frameset” start tag seen.
|
||||
31: End of file seen and there were open elements.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><table><caption><svg>foo</table>bar
|
||||
#errors
|
||||
47: End tag “table” did not match the name of the current open element (“svg”).
|
||||
47: “table” closed but “caption” was still open.
|
||||
47: End tag “table” seen, but there were open elements.
|
||||
36: Unclosed element “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <svg svg>
|
||||
| "foo"
|
||||
| "bar"
|
||||
|
||||
#data
|
||||
<table><tr><td><svg><desc><td></desc><circle>
|
||||
#errors
|
||||
7: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
|
||||
30: A table cell was implicitly closed, but there were open elements.
|
||||
26: Unclosed element “desc”.
|
||||
20: Unclosed element “svg”.
|
||||
37: Stray end tag “desc”.
|
||||
45: End of file seen and there were open elements.
|
||||
45: Unclosed element “circle”.
|
||||
7: Unclosed element “table”.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <td>
|
||||
| <circle>
|
||||
BIN
html5lib/tests/testdata/tree-construction/plain-text-unsafe.dat
vendored
Normal file
BIN
html5lib/tests/testdata/tree-construction/plain-text-unsafe.dat
vendored
Normal file
Binary file not shown.
308
html5lib/tests/testdata/tree-construction/scriptdata01.dat
vendored
Normal file
308
html5lib/tests/testdata/tree-construction/scriptdata01.dat
vendored
Normal file
@@ -0,0 +1,308 @@
|
||||
#data
|
||||
FOO<script>'Hello'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'Hello'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script></script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script></script >BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script></script/>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script></script/ >BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain"></scriptx>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "</scriptx>BAR"
|
||||
|
||||
#data
|
||||
FOO<script></script foo=">" dd>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!--'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!--'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!---'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!---'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-->'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-->'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-->'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-->'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-- potato'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-- potato'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-- <sCrIpt'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-- <sCrIpt'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt>'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt>'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt> -'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt> -'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt> --'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt> --'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script>'<!-- <sCrIpt> -->'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| "'<!-- <sCrIpt> -->'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt> --!>'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt> --!>'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt> -- >'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt> -- >'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt '</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt '</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt/'</script>BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt\'</script>BAR
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt\'"
|
||||
| "BAR"
|
||||
|
||||
#data
|
||||
FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR</script>QUX
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "FOO"
|
||||
| <script>
|
||||
| type="text/plain"
|
||||
| "'<!-- <sCrIpt/'</script>BAR"
|
||||
| "QUX"
|
||||
212
html5lib/tests/testdata/tree-construction/tables01.dat
vendored
Normal file
212
html5lib/tests/testdata/tree-construction/tables01.dat
vendored
Normal file
@@ -0,0 +1,212 @@
|
||||
#data
|
||||
<table><th>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <th>
|
||||
|
||||
#data
|
||||
<table><td>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<table><col foo='bar'>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <colgroup>
|
||||
| <col>
|
||||
| foo="bar"
|
||||
|
||||
#data
|
||||
<table><colgroup></html>foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "foo"
|
||||
| <table>
|
||||
| <colgroup>
|
||||
|
||||
#data
|
||||
<table></table><p>foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <p>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<table><select><option>3</select></table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <option>
|
||||
| "3"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<table><select><table></table></select></table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <table>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<table><select></table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<table><select><option>A<tr><td>B</td></tr></table>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <option>
|
||||
| "A"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "B"
|
||||
|
||||
#data
|
||||
<table><td></body></caption></col></colgroup></html>foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<table><td>A</table>B
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "A"
|
||||
| "B"
|
||||
|
||||
#data
|
||||
<table><tr><caption>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <caption>
|
||||
|
||||
#data
|
||||
<table><tr></body></caption></col></colgroup></html></td></th><td>foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<table><td><tr>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<table><td><button><td>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <button>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<table><tr><td><svg><desc><td>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <td>
|
||||
1952
html5lib/tests/testdata/tree-construction/tests1.dat
vendored
Normal file
1952
html5lib/tests/testdata/tree-construction/tests1.dat
vendored
Normal file
File diff suppressed because it is too large
Load Diff
799
html5lib/tests/testdata/tree-construction/tests10.dat
vendored
Normal file
799
html5lib/tests/testdata/tree-construction/tests10.dat
vendored
Normal file
@@ -0,0 +1,799 @@
|
||||
#data
|
||||
<!DOCTYPE html><svg></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><svg></svg><![CDATA[a]]>
|
||||
#errors
|
||||
29: Bogus comment
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <!-- [CDATA[a]] -->
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><svg></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><select><svg></svg></select>
|
||||
#errors
|
||||
35: Stray “svg” start tag.
|
||||
42: Stray end tag “svg”
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><select><option><svg></svg></option></select>
|
||||
#errors
|
||||
43: Stray “svg” start tag.
|
||||
50: Stray end tag “svg”
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <option>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><svg></svg></table>
|
||||
#errors
|
||||
34: Start tag “svg” seen in “table”.
|
||||
41: Stray end tag “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><svg><g>foo</g></svg></table>
|
||||
#errors
|
||||
34: Start tag “svg” seen in “table”.
|
||||
46: Stray end tag “g”.
|
||||
53: Stray end tag “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><svg><g>foo</g><g>bar</g></svg></table>
|
||||
#errors
|
||||
34: Start tag “svg” seen in “table”.
|
||||
46: Stray end tag “g”.
|
||||
58: Stray end tag “g”.
|
||||
65: Stray end tag “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><tbody><svg><g>foo</g><g>bar</g></svg></tbody></table>
|
||||
#errors
|
||||
41: Start tag “svg” seen in “table”.
|
||||
53: Stray end tag “g”.
|
||||
65: Stray end tag “g”.
|
||||
72: Stray end tag “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <table>
|
||||
| <tbody>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><tbody><tr><svg><g>foo</g><g>bar</g></svg></tr></tbody></table>
|
||||
#errors
|
||||
45: Start tag “svg” seen in “table”.
|
||||
57: Stray end tag “g”.
|
||||
69: Stray end tag “g”.
|
||||
76: Stray end tag “svg”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg></td></tr></tbody></table>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg><p>baz</td></tr></tbody></table>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g></svg><p>baz</caption></table>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
|
||||
#errors
|
||||
70: HTML start tag “p” in a foreign namespace context.
|
||||
81: “table” closed but “caption” was still open.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
| <p>
|
||||
| "quux"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g>baz</table><p>quux
|
||||
#errors
|
||||
78: “table” closed but “caption” was still open.
|
||||
78: Unclosed elements on stack.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| "baz"
|
||||
| <p>
|
||||
| "quux"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><colgroup><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
|
||||
#errors
|
||||
44: Start tag “svg” seen in “table”.
|
||||
56: Stray end tag “g”.
|
||||
68: Stray end tag “g”.
|
||||
71: HTML start tag “p” in a foreign namespace context.
|
||||
71: Start tag “p” seen in “table”.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
| <table>
|
||||
| <colgroup>
|
||||
| <p>
|
||||
| "quux"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><tr><td><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
|
||||
#errors
|
||||
50: Stray “svg” start tag.
|
||||
54: Stray “g” start tag.
|
||||
62: Stray end tag “g”
|
||||
66: Stray “g” start tag.
|
||||
74: Stray end tag “g”
|
||||
77: Stray “p” start tag.
|
||||
88: “table” end tag with “select” open.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <select>
|
||||
| "foobarbaz"
|
||||
| <p>
|
||||
| "quux"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><table><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
|
||||
#errors
|
||||
36: Start tag “select” seen in “table”.
|
||||
42: Stray “svg” start tag.
|
||||
46: Stray “g” start tag.
|
||||
54: Stray end tag “g”
|
||||
58: Stray “g” start tag.
|
||||
66: Stray end tag “g”
|
||||
69: Stray “p” start tag.
|
||||
80: “table” end tag with “select” open.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| "foobarbaz"
|
||||
| <table>
|
||||
| <p>
|
||||
| "quux"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body></body></html><svg><g>foo</g><g>bar</g><p>baz
|
||||
#errors
|
||||
41: Stray “svg” start tag.
|
||||
68: HTML start tag “p” in a foreign namespace context.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body></body><svg><g>foo</g><g>bar</g><p>baz
|
||||
#errors
|
||||
34: Stray “svg” start tag.
|
||||
61: HTML start tag “p” in a foreign namespace context.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| "foo"
|
||||
| <svg g>
|
||||
| "bar"
|
||||
| <p>
|
||||
| "baz"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><frameset><svg><g></g><g></g><p><span>
|
||||
#errors
|
||||
31: Stray “svg” start tag.
|
||||
35: Stray “g” start tag.
|
||||
40: Stray end tag “g”
|
||||
44: Stray “g” start tag.
|
||||
49: Stray end tag “g”
|
||||
52: Stray “p” start tag.
|
||||
58: Stray “span” start tag.
|
||||
58: End of file seen and there were open elements.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><frameset></frameset><svg><g></g><g></g><p><span>
|
||||
#errors
|
||||
42: Stray “svg” start tag.
|
||||
46: Stray “g” start tag.
|
||||
51: Stray end tag “g”
|
||||
55: Stray “g” start tag.
|
||||
60: Stray end tag “g”
|
||||
63: Stray “p” start tag.
|
||||
69: Stray “span” start tag.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body xlink:href=foo><svg xlink:href=foo></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| xlink:href="foo"
|
||||
| <svg svg>
|
||||
| xlink href="foo"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo></g></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| xlink:href="foo"
|
||||
| xml:lang="en"
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| xlink href="foo"
|
||||
| xml lang="en"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo /></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| xlink:href="foo"
|
||||
| xml:lang="en"
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| xlink href="foo"
|
||||
| xml lang="en"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo />bar</svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| xlink:href="foo"
|
||||
| xml:lang="en"
|
||||
| <svg svg>
|
||||
| <svg g>
|
||||
| xlink href="foo"
|
||||
| xml lang="en"
|
||||
| "bar"
|
||||
|
||||
#data
|
||||
<svg></path>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<div><svg></div>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<div><svg><path></div>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <svg path>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<div><svg><path></svg><path>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <svg path>
|
||||
| <path>
|
||||
|
||||
#data
|
||||
<div><svg><path><foreignObject><math></div>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <svg path>
|
||||
| <svg foreignObject>
|
||||
| <math math>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<div><svg><path><foreignObject><p></div>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <svg path>
|
||||
| <svg foreignObject>
|
||||
| <p>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><svg><desc><div><svg><ul>a
|
||||
#errors
|
||||
40: HTML start tag “ul” in a foreign namespace context.
|
||||
41: End of file in a foreign namespace context.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <ul>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><svg><desc><svg><ul>a
|
||||
#errors
|
||||
35: HTML start tag “ul” in a foreign namespace context.
|
||||
36: End of file in a foreign namespace context.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <svg svg>
|
||||
| <ul>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><p><svg><desc><p>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><p><svg><title><p>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <svg svg>
|
||||
| <svg title>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<div><svg><path><foreignObject><p></foreignObject><p>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <svg svg>
|
||||
| <svg path>
|
||||
| <svg foreignObject>
|
||||
| <p>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<math><mi><div><object><div><span></span></div></object></div></mi><mi>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <div>
|
||||
| <object>
|
||||
| <div>
|
||||
| <span>
|
||||
| <math mi>
|
||||
|
||||
#data
|
||||
<math><mi><svg><foreignObject><div><div></div></div></foreignObject></svg></mi><mi>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <svg svg>
|
||||
| <svg foreignObject>
|
||||
| <div>
|
||||
| <div>
|
||||
| <math mi>
|
||||
|
||||
#data
|
||||
<svg><script></script><path>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg script>
|
||||
| <svg path>
|
||||
|
||||
#data
|
||||
<table><svg></svg><tr>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<math><mi><mglyph>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <math mglyph>
|
||||
|
||||
#data
|
||||
<math><mi><malignmark>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <math malignmark>
|
||||
|
||||
#data
|
||||
<math><mo><mglyph>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mo>
|
||||
| <math mglyph>
|
||||
|
||||
#data
|
||||
<math><mo><malignmark>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mo>
|
||||
| <math malignmark>
|
||||
|
||||
#data
|
||||
<math><mn><mglyph>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mn>
|
||||
| <math mglyph>
|
||||
|
||||
#data
|
||||
<math><mn><malignmark>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mn>
|
||||
| <math malignmark>
|
||||
|
||||
#data
|
||||
<math><ms><mglyph>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math ms>
|
||||
| <math mglyph>
|
||||
|
||||
#data
|
||||
<math><ms><malignmark>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math ms>
|
||||
| <math malignmark>
|
||||
|
||||
#data
|
||||
<math><mtext><mglyph>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mtext>
|
||||
| <math mglyph>
|
||||
|
||||
#data
|
||||
<math><mtext><malignmark>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math mtext>
|
||||
| <math malignmark>
|
||||
|
||||
#data
|
||||
<math><annotation-xml><svg></svg></annotation-xml><mi>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| <svg svg>
|
||||
| <math mi>
|
||||
|
||||
#data
|
||||
<math><annotation-xml><svg><foreignObject><div><math><mi></mi></math><span></span></div></foreignObject><path></path></svg></annotation-xml><mi>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| <svg svg>
|
||||
| <svg foreignObject>
|
||||
| <div>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <span>
|
||||
| <svg path>
|
||||
| <math mi>
|
||||
|
||||
#data
|
||||
<math><annotation-xml><svg><foreignObject><math><mi><svg></svg></mi><mo></mo></math><span></span></foreignObject><path></path></svg></annotation-xml><mi>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| <svg svg>
|
||||
| <svg foreignObject>
|
||||
| <math math>
|
||||
| <math mi>
|
||||
| <svg svg>
|
||||
| <math mo>
|
||||
| <span>
|
||||
| <svg path>
|
||||
| <math mi>
|
||||
482
html5lib/tests/testdata/tree-construction/tests11.dat
vendored
Normal file
482
html5lib/tests/testdata/tree-construction/tests11.dat
vendored
Normal file
@@ -0,0 +1,482 @@
|
||||
#data
|
||||
<!DOCTYPE html><body><svg attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' contentScriptType='' contentStyleType='' diffuseConstant='' edgeMode='' externalResourcesRequired='' filterRes='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| attributeName=""
|
||||
| attributeType=""
|
||||
| baseFrequency=""
|
||||
| baseProfile=""
|
||||
| calcMode=""
|
||||
| clipPathUnits=""
|
||||
| contentScriptType=""
|
||||
| contentStyleType=""
|
||||
| diffuseConstant=""
|
||||
| edgeMode=""
|
||||
| externalResourcesRequired=""
|
||||
| filterRes=""
|
||||
| filterUnits=""
|
||||
| glyphRef=""
|
||||
| gradientTransform=""
|
||||
| gradientUnits=""
|
||||
| kernelMatrix=""
|
||||
| kernelUnitLength=""
|
||||
| keyPoints=""
|
||||
| keySplines=""
|
||||
| keyTimes=""
|
||||
| lengthAdjust=""
|
||||
| limitingConeAngle=""
|
||||
| markerHeight=""
|
||||
| markerUnits=""
|
||||
| markerWidth=""
|
||||
| maskContentUnits=""
|
||||
| maskUnits=""
|
||||
| numOctaves=""
|
||||
| pathLength=""
|
||||
| patternContentUnits=""
|
||||
| patternTransform=""
|
||||
| patternUnits=""
|
||||
| pointsAtX=""
|
||||
| pointsAtY=""
|
||||
| pointsAtZ=""
|
||||
| preserveAlpha=""
|
||||
| preserveAspectRatio=""
|
||||
| primitiveUnits=""
|
||||
| refX=""
|
||||
| refY=""
|
||||
| repeatCount=""
|
||||
| repeatDur=""
|
||||
| requiredExtensions=""
|
||||
| requiredFeatures=""
|
||||
| specularConstant=""
|
||||
| specularExponent=""
|
||||
| spreadMethod=""
|
||||
| startOffset=""
|
||||
| stdDeviation=""
|
||||
| stitchTiles=""
|
||||
| surfaceScale=""
|
||||
| systemLanguage=""
|
||||
| tableValues=""
|
||||
| targetX=""
|
||||
| targetY=""
|
||||
| textLength=""
|
||||
| viewBox=""
|
||||
| viewTarget=""
|
||||
| xChannelSelector=""
|
||||
| yChannelSelector=""
|
||||
| zoomAndPan=""
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><BODY><SVG ATTRIBUTENAME='' ATTRIBUTETYPE='' BASEFREQUENCY='' BASEPROFILE='' CALCMODE='' CLIPPATHUNITS='' CONTENTSCRIPTTYPE='' CONTENTSTYLETYPE='' DIFFUSECONSTANT='' EDGEMODE='' EXTERNALRESOURCESREQUIRED='' FILTERRES='' FILTERUNITS='' GLYPHREF='' GRADIENTTRANSFORM='' GRADIENTUNITS='' KERNELMATRIX='' KERNELUNITLENGTH='' KEYPOINTS='' KEYSPLINES='' KEYTIMES='' LENGTHADJUST='' LIMITINGCONEANGLE='' MARKERHEIGHT='' MARKERUNITS='' MARKERWIDTH='' MASKCONTENTUNITS='' MASKUNITS='' NUMOCTAVES='' PATHLENGTH='' PATTERNCONTENTUNITS='' PATTERNTRANSFORM='' PATTERNUNITS='' POINTSATX='' POINTSATY='' POINTSATZ='' PRESERVEALPHA='' PRESERVEASPECTRATIO='' PRIMITIVEUNITS='' REFX='' REFY='' REPEATCOUNT='' REPEATDUR='' REQUIREDEXTENSIONS='' REQUIREDFEATURES='' SPECULARCONSTANT='' SPECULAREXPONENT='' SPREADMETHOD='' STARTOFFSET='' STDDEVIATION='' STITCHTILES='' SURFACESCALE='' SYSTEMLANGUAGE='' TABLEVALUES='' TARGETX='' TARGETY='' TEXTLENGTH='' VIEWBOX='' VIEWTARGET='' XCHANNELSELECTOR='' YCHANNELSELECTOR='' ZOOMANDPAN=''></SVG>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| attributeName=""
|
||||
| attributeType=""
|
||||
| baseFrequency=""
|
||||
| baseProfile=""
|
||||
| calcMode=""
|
||||
| clipPathUnits=""
|
||||
| contentScriptType=""
|
||||
| contentStyleType=""
|
||||
| diffuseConstant=""
|
||||
| edgeMode=""
|
||||
| externalResourcesRequired=""
|
||||
| filterRes=""
|
||||
| filterUnits=""
|
||||
| glyphRef=""
|
||||
| gradientTransform=""
|
||||
| gradientUnits=""
|
||||
| kernelMatrix=""
|
||||
| kernelUnitLength=""
|
||||
| keyPoints=""
|
||||
| keySplines=""
|
||||
| keyTimes=""
|
||||
| lengthAdjust=""
|
||||
| limitingConeAngle=""
|
||||
| markerHeight=""
|
||||
| markerUnits=""
|
||||
| markerWidth=""
|
||||
| maskContentUnits=""
|
||||
| maskUnits=""
|
||||
| numOctaves=""
|
||||
| pathLength=""
|
||||
| patternContentUnits=""
|
||||
| patternTransform=""
|
||||
| patternUnits=""
|
||||
| pointsAtX=""
|
||||
| pointsAtY=""
|
||||
| pointsAtZ=""
|
||||
| preserveAlpha=""
|
||||
| preserveAspectRatio=""
|
||||
| primitiveUnits=""
|
||||
| refX=""
|
||||
| refY=""
|
||||
| repeatCount=""
|
||||
| repeatDur=""
|
||||
| requiredExtensions=""
|
||||
| requiredFeatures=""
|
||||
| specularConstant=""
|
||||
| specularExponent=""
|
||||
| spreadMethod=""
|
||||
| startOffset=""
|
||||
| stdDeviation=""
|
||||
| stitchTiles=""
|
||||
| surfaceScale=""
|
||||
| systemLanguage=""
|
||||
| tableValues=""
|
||||
| targetX=""
|
||||
| targetY=""
|
||||
| textLength=""
|
||||
| viewBox=""
|
||||
| viewTarget=""
|
||||
| xChannelSelector=""
|
||||
| yChannelSelector=""
|
||||
| zoomAndPan=""
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><svg attributename='' attributetype='' basefrequency='' baseprofile='' calcmode='' clippathunits='' contentscripttype='' contentstyletype='' diffuseconstant='' edgemode='' externalresourcesrequired='' filterres='' filterunits='' glyphref='' gradienttransform='' gradientunits='' kernelmatrix='' kernelunitlength='' keypoints='' keysplines='' keytimes='' lengthadjust='' limitingconeangle='' markerheight='' markerunits='' markerwidth='' maskcontentunits='' maskunits='' numoctaves='' pathlength='' patterncontentunits='' patterntransform='' patternunits='' pointsatx='' pointsaty='' pointsatz='' preservealpha='' preserveaspectratio='' primitiveunits='' refx='' refy='' repeatcount='' repeatdur='' requiredextensions='' requiredfeatures='' specularconstant='' specularexponent='' spreadmethod='' startoffset='' stddeviation='' stitchtiles='' surfacescale='' systemlanguage='' tablevalues='' targetx='' targety='' textlength='' viewbox='' viewtarget='' xchannelselector='' ychannelselector='' zoomandpan=''></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| attributeName=""
|
||||
| attributeType=""
|
||||
| baseFrequency=""
|
||||
| baseProfile=""
|
||||
| calcMode=""
|
||||
| clipPathUnits=""
|
||||
| contentScriptType=""
|
||||
| contentStyleType=""
|
||||
| diffuseConstant=""
|
||||
| edgeMode=""
|
||||
| externalResourcesRequired=""
|
||||
| filterRes=""
|
||||
| filterUnits=""
|
||||
| glyphRef=""
|
||||
| gradientTransform=""
|
||||
| gradientUnits=""
|
||||
| kernelMatrix=""
|
||||
| kernelUnitLength=""
|
||||
| keyPoints=""
|
||||
| keySplines=""
|
||||
| keyTimes=""
|
||||
| lengthAdjust=""
|
||||
| limitingConeAngle=""
|
||||
| markerHeight=""
|
||||
| markerUnits=""
|
||||
| markerWidth=""
|
||||
| maskContentUnits=""
|
||||
| maskUnits=""
|
||||
| numOctaves=""
|
||||
| pathLength=""
|
||||
| patternContentUnits=""
|
||||
| patternTransform=""
|
||||
| patternUnits=""
|
||||
| pointsAtX=""
|
||||
| pointsAtY=""
|
||||
| pointsAtZ=""
|
||||
| preserveAlpha=""
|
||||
| preserveAspectRatio=""
|
||||
| primitiveUnits=""
|
||||
| refX=""
|
||||
| refY=""
|
||||
| repeatCount=""
|
||||
| repeatDur=""
|
||||
| requiredExtensions=""
|
||||
| requiredFeatures=""
|
||||
| specularConstant=""
|
||||
| specularExponent=""
|
||||
| spreadMethod=""
|
||||
| startOffset=""
|
||||
| stdDeviation=""
|
||||
| stitchTiles=""
|
||||
| surfaceScale=""
|
||||
| systemLanguage=""
|
||||
| tableValues=""
|
||||
| targetX=""
|
||||
| targetY=""
|
||||
| textLength=""
|
||||
| viewBox=""
|
||||
| viewTarget=""
|
||||
| xChannelSelector=""
|
||||
| yChannelSelector=""
|
||||
| zoomAndPan=""
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><math attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' contentScriptType='' contentStyleType='' diffuseConstant='' edgeMode='' externalResourcesRequired='' filterRes='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></math>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| attributename=""
|
||||
| attributetype=""
|
||||
| basefrequency=""
|
||||
| baseprofile=""
|
||||
| calcmode=""
|
||||
| clippathunits=""
|
||||
| contentscripttype=""
|
||||
| contentstyletype=""
|
||||
| diffuseconstant=""
|
||||
| edgemode=""
|
||||
| externalresourcesrequired=""
|
||||
| filterres=""
|
||||
| filterunits=""
|
||||
| glyphref=""
|
||||
| gradienttransform=""
|
||||
| gradientunits=""
|
||||
| kernelmatrix=""
|
||||
| kernelunitlength=""
|
||||
| keypoints=""
|
||||
| keysplines=""
|
||||
| keytimes=""
|
||||
| lengthadjust=""
|
||||
| limitingconeangle=""
|
||||
| markerheight=""
|
||||
| markerunits=""
|
||||
| markerwidth=""
|
||||
| maskcontentunits=""
|
||||
| maskunits=""
|
||||
| numoctaves=""
|
||||
| pathlength=""
|
||||
| patterncontentunits=""
|
||||
| patterntransform=""
|
||||
| patternunits=""
|
||||
| pointsatx=""
|
||||
| pointsaty=""
|
||||
| pointsatz=""
|
||||
| preservealpha=""
|
||||
| preserveaspectratio=""
|
||||
| primitiveunits=""
|
||||
| refx=""
|
||||
| refy=""
|
||||
| repeatcount=""
|
||||
| repeatdur=""
|
||||
| requiredextensions=""
|
||||
| requiredfeatures=""
|
||||
| specularconstant=""
|
||||
| specularexponent=""
|
||||
| spreadmethod=""
|
||||
| startoffset=""
|
||||
| stddeviation=""
|
||||
| stitchtiles=""
|
||||
| surfacescale=""
|
||||
| systemlanguage=""
|
||||
| tablevalues=""
|
||||
| targetx=""
|
||||
| targety=""
|
||||
| textlength=""
|
||||
| viewbox=""
|
||||
| viewtarget=""
|
||||
| xchannelselector=""
|
||||
| ychannelselector=""
|
||||
| zoomandpan=""
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><svg><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg altGlyph>
|
||||
| <svg altGlyphDef>
|
||||
| <svg altGlyphItem>
|
||||
| <svg animateColor>
|
||||
| <svg animateMotion>
|
||||
| <svg animateTransform>
|
||||
| <svg clipPath>
|
||||
| <svg feBlend>
|
||||
| <svg feColorMatrix>
|
||||
| <svg feComponentTransfer>
|
||||
| <svg feComposite>
|
||||
| <svg feConvolveMatrix>
|
||||
| <svg feDiffuseLighting>
|
||||
| <svg feDisplacementMap>
|
||||
| <svg feDistantLight>
|
||||
| <svg feFlood>
|
||||
| <svg feFuncA>
|
||||
| <svg feFuncB>
|
||||
| <svg feFuncG>
|
||||
| <svg feFuncR>
|
||||
| <svg feGaussianBlur>
|
||||
| <svg feImage>
|
||||
| <svg feMerge>
|
||||
| <svg feMergeNode>
|
||||
| <svg feMorphology>
|
||||
| <svg feOffset>
|
||||
| <svg fePointLight>
|
||||
| <svg feSpecularLighting>
|
||||
| <svg feSpotLight>
|
||||
| <svg feTile>
|
||||
| <svg feTurbulence>
|
||||
| <svg foreignObject>
|
||||
| <svg glyphRef>
|
||||
| <svg linearGradient>
|
||||
| <svg radialGradient>
|
||||
| <svg textPath>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><svg><altglyph /><altglyphdef /><altglyphitem /><animatecolor /><animatemotion /><animatetransform /><clippath /><feblend /><fecolormatrix /><fecomponenttransfer /><fecomposite /><feconvolvematrix /><fediffuselighting /><fedisplacementmap /><fedistantlight /><feflood /><fefunca /><fefuncb /><fefuncg /><fefuncr /><fegaussianblur /><feimage /><femerge /><femergenode /><femorphology /><feoffset /><fepointlight /><fespecularlighting /><fespotlight /><fetile /><feturbulence /><foreignobject /><glyphref /><lineargradient /><radialgradient /><textpath /></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg altGlyph>
|
||||
| <svg altGlyphDef>
|
||||
| <svg altGlyphItem>
|
||||
| <svg animateColor>
|
||||
| <svg animateMotion>
|
||||
| <svg animateTransform>
|
||||
| <svg clipPath>
|
||||
| <svg feBlend>
|
||||
| <svg feColorMatrix>
|
||||
| <svg feComponentTransfer>
|
||||
| <svg feComposite>
|
||||
| <svg feConvolveMatrix>
|
||||
| <svg feDiffuseLighting>
|
||||
| <svg feDisplacementMap>
|
||||
| <svg feDistantLight>
|
||||
| <svg feFlood>
|
||||
| <svg feFuncA>
|
||||
| <svg feFuncB>
|
||||
| <svg feFuncG>
|
||||
| <svg feFuncR>
|
||||
| <svg feGaussianBlur>
|
||||
| <svg feImage>
|
||||
| <svg feMerge>
|
||||
| <svg feMergeNode>
|
||||
| <svg feMorphology>
|
||||
| <svg feOffset>
|
||||
| <svg fePointLight>
|
||||
| <svg feSpecularLighting>
|
||||
| <svg feSpotLight>
|
||||
| <svg feTile>
|
||||
| <svg feTurbulence>
|
||||
| <svg foreignObject>
|
||||
| <svg glyphRef>
|
||||
| <svg linearGradient>
|
||||
| <svg radialGradient>
|
||||
| <svg textPath>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><BODY><SVG><ALTGLYPH /><ALTGLYPHDEF /><ALTGLYPHITEM /><ANIMATECOLOR /><ANIMATEMOTION /><ANIMATETRANSFORM /><CLIPPATH /><FEBLEND /><FECOLORMATRIX /><FECOMPONENTTRANSFER /><FECOMPOSITE /><FECONVOLVEMATRIX /><FEDIFFUSELIGHTING /><FEDISPLACEMENTMAP /><FEDISTANTLIGHT /><FEFLOOD /><FEFUNCA /><FEFUNCB /><FEFUNCG /><FEFUNCR /><FEGAUSSIANBLUR /><FEIMAGE /><FEMERGE /><FEMERGENODE /><FEMORPHOLOGY /><FEOFFSET /><FEPOINTLIGHT /><FESPECULARLIGHTING /><FESPOTLIGHT /><FETILE /><FETURBULENCE /><FOREIGNOBJECT /><GLYPHREF /><LINEARGRADIENT /><RADIALGRADIENT /><TEXTPATH /></SVG>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg altGlyph>
|
||||
| <svg altGlyphDef>
|
||||
| <svg altGlyphItem>
|
||||
| <svg animateColor>
|
||||
| <svg animateMotion>
|
||||
| <svg animateTransform>
|
||||
| <svg clipPath>
|
||||
| <svg feBlend>
|
||||
| <svg feColorMatrix>
|
||||
| <svg feComponentTransfer>
|
||||
| <svg feComposite>
|
||||
| <svg feConvolveMatrix>
|
||||
| <svg feDiffuseLighting>
|
||||
| <svg feDisplacementMap>
|
||||
| <svg feDistantLight>
|
||||
| <svg feFlood>
|
||||
| <svg feFuncA>
|
||||
| <svg feFuncB>
|
||||
| <svg feFuncG>
|
||||
| <svg feFuncR>
|
||||
| <svg feGaussianBlur>
|
||||
| <svg feImage>
|
||||
| <svg feMerge>
|
||||
| <svg feMergeNode>
|
||||
| <svg feMorphology>
|
||||
| <svg feOffset>
|
||||
| <svg fePointLight>
|
||||
| <svg feSpecularLighting>
|
||||
| <svg feSpotLight>
|
||||
| <svg feTile>
|
||||
| <svg feTurbulence>
|
||||
| <svg foreignObject>
|
||||
| <svg glyphRef>
|
||||
| <svg linearGradient>
|
||||
| <svg radialGradient>
|
||||
| <svg textPath>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><math><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></math>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math altglyph>
|
||||
| <math altglyphdef>
|
||||
| <math altglyphitem>
|
||||
| <math animatecolor>
|
||||
| <math animatemotion>
|
||||
| <math animatetransform>
|
||||
| <math clippath>
|
||||
| <math feblend>
|
||||
| <math fecolormatrix>
|
||||
| <math fecomponenttransfer>
|
||||
| <math fecomposite>
|
||||
| <math feconvolvematrix>
|
||||
| <math fediffuselighting>
|
||||
| <math fedisplacementmap>
|
||||
| <math fedistantlight>
|
||||
| <math feflood>
|
||||
| <math fefunca>
|
||||
| <math fefuncb>
|
||||
| <math fefuncg>
|
||||
| <math fefuncr>
|
||||
| <math fegaussianblur>
|
||||
| <math feimage>
|
||||
| <math femerge>
|
||||
| <math femergenode>
|
||||
| <math femorphology>
|
||||
| <math feoffset>
|
||||
| <math fepointlight>
|
||||
| <math fespecularlighting>
|
||||
| <math fespotlight>
|
||||
| <math fetile>
|
||||
| <math feturbulence>
|
||||
| <math foreignobject>
|
||||
| <math glyphref>
|
||||
| <math lineargradient>
|
||||
| <math radialgradient>
|
||||
| <math textpath>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><svg><solidColor /></svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg solidcolor>
|
||||
62
html5lib/tests/testdata/tree-construction/tests12.dat
vendored
Normal file
62
html5lib/tests/testdata/tree-construction/tests12.dat
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
#data
|
||||
<!DOCTYPE html><body><p>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| "foo"
|
||||
| <math math>
|
||||
| <math mtext>
|
||||
| <i>
|
||||
| "baz"
|
||||
| <math annotation-xml>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <b>
|
||||
| "eggs"
|
||||
| <svg g>
|
||||
| <svg foreignObject>
|
||||
| <p>
|
||||
| "spam"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <img>
|
||||
| <svg g>
|
||||
| "quux"
|
||||
| "bar"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "foo"
|
||||
| <math math>
|
||||
| <math mtext>
|
||||
| <i>
|
||||
| "baz"
|
||||
| <math annotation-xml>
|
||||
| <svg svg>
|
||||
| <svg desc>
|
||||
| <b>
|
||||
| "eggs"
|
||||
| <svg g>
|
||||
| <svg foreignObject>
|
||||
| <p>
|
||||
| "spam"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <img>
|
||||
| <svg g>
|
||||
| "quux"
|
||||
| "bar"
|
||||
74
html5lib/tests/testdata/tree-construction/tests14.dat
vendored
Normal file
74
html5lib/tests/testdata/tree-construction/tests14.dat
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
#data
|
||||
<!DOCTYPE html><html><body><xyz:abc></xyz:abc>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <xyz:abc>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html><body><xyz:abc></xyz:abc><span></span>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <xyz:abc>
|
||||
| <span>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html><html abc:def=gh><xyz:abc></xyz:abc>
|
||||
#errors
|
||||
15: Unexpected start tag html
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| abc:def="gh"
|
||||
| <head>
|
||||
| <body>
|
||||
| <xyz:abc>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html xml:lang=bar><html xml:lang=foo>
|
||||
#errors
|
||||
15: Unexpected start tag html
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| xml:lang="bar"
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html 123=456>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| 123="456"
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html 123=456><html 789=012>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| 123="456"
|
||||
| 789="012"
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html><body 789=012>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| 789="012"
|
||||
208
html5lib/tests/testdata/tree-construction/tests15.dat
vendored
Normal file
208
html5lib/tests/testdata/tree-construction/tests15.dat
vendored
Normal file
@@ -0,0 +1,208 @@
|
||||
#data
|
||||
<!DOCTYPE html><p><b><i><u></p> <p>X
|
||||
#errors
|
||||
Line: 1 Col: 31 Unexpected end tag (p). Ignored.
|
||||
Line: 1 Col: 36 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <b>
|
||||
| <i>
|
||||
| <u>
|
||||
| <b>
|
||||
| <i>
|
||||
| <u>
|
||||
| " "
|
||||
| <p>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<p><b><i><u></p>
|
||||
<p>X
|
||||
#errors
|
||||
Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE.
|
||||
Line: 1 Col: 16 Unexpected end tag (p). Ignored.
|
||||
Line: 2 Col: 4 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <b>
|
||||
| <i>
|
||||
| <u>
|
||||
| <b>
|
||||
| <i>
|
||||
| <u>
|
||||
| "
|
||||
"
|
||||
| <p>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<!doctype html></html> <head>
|
||||
#errors
|
||||
Line: 1 Col: 22 Unexpected end tag (html) after the (implied) root element.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| " "
|
||||
|
||||
#data
|
||||
<!doctype html></body><meta>
|
||||
#errors
|
||||
Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <meta>
|
||||
|
||||
#data
|
||||
<html></html><!-- foo -->
|
||||
#errors
|
||||
Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE.
|
||||
Line: 1 Col: 13 Unexpected end tag (html) after the (implied) root element.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <!-- foo -->
|
||||
|
||||
#data
|
||||
<!doctype html></body><title>X</title>
|
||||
#errors
|
||||
Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <title>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<!doctype html><table> X<meta></table>
|
||||
#errors
|
||||
Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode.
|
||||
Line: 1 Col: 30 Unexpected start tag (meta) in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| " X"
|
||||
| <meta>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table> x</table>
|
||||
#errors
|
||||
Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| " x"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table> x </table>
|
||||
#errors
|
||||
Line: 1 Col: 25 Unexpected non-space characters in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| " x "
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr> x</table>
|
||||
#errors
|
||||
Line: 1 Col: 28 Unexpected non-space characters in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| " x"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><table>X<style> <tr>x </style> </table>
|
||||
#errors
|
||||
Line: 1 Col: 23 Unexpected non-space characters in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "X"
|
||||
| <table>
|
||||
| <style>
|
||||
| " <tr>x "
|
||||
| " "
|
||||
|
||||
#data
|
||||
<!doctype html><div><table><a>foo</a> <tr><td>bar</td> </tr></table></div>
|
||||
#errors
|
||||
Line: 1 Col: 30 Unexpected start tag (a) in table context caused voodoo mode.
|
||||
Line: 1 Col: 37 Unexpected end tag (a) in table context caused voodoo mode.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <a>
|
||||
| "foo"
|
||||
| <table>
|
||||
| " "
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "bar"
|
||||
| " "
|
||||
|
||||
#data
|
||||
<frame></frame></frame><frameset><frame><frameset><frame></frameset><noframes></frameset><noframes>
|
||||
#errors
|
||||
6: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
|
||||
13: Stray start tag “frame”.
|
||||
21: Stray end tag “frame”.
|
||||
29: Stray end tag “frame”.
|
||||
39: “frameset” start tag after “body” already open.
|
||||
105: End of file seen inside an [R]CDATA element.
|
||||
105: End of file seen and there were open elements.
|
||||
XXX: These errors are wrong, please fix me!
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
| <frame>
|
||||
| <frameset>
|
||||
| <frame>
|
||||
| <noframes>
|
||||
| "</frameset><noframes>"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><object></html>
|
||||
#errors
|
||||
1: Expected closing tag. Unexpected end of file
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <object>
|
||||
2299
html5lib/tests/testdata/tree-construction/tests16.dat
vendored
Normal file
2299
html5lib/tests/testdata/tree-construction/tests16.dat
vendored
Normal file
File diff suppressed because it is too large
Load Diff
153
html5lib/tests/testdata/tree-construction/tests17.dat
vendored
Normal file
153
html5lib/tests/testdata/tree-construction/tests17.dat
vendored
Normal file
@@ -0,0 +1,153 @@
|
||||
#data
|
||||
<!doctype html><table><tbody><select><tr>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><select><td>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><td><select><td>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <select>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><th><select><td>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <th>
|
||||
| <select>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<!doctype html><table><caption><select><tr>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <select>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><select><tr>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><td>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><th>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><tbody>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><thead>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><tfoot>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><select><caption>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr></table>a
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| "a"
|
||||
269
html5lib/tests/testdata/tree-construction/tests18.dat
vendored
Normal file
269
html5lib/tests/testdata/tree-construction/tests18.dat
vendored
Normal file
@@ -0,0 +1,269 @@
|
||||
#data
|
||||
<!doctype html><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
|
||||
#data
|
||||
<!doctype html><table><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tbody><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
| <table>
|
||||
| <tbody>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tbody><tr><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tbody><tr><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><table><td><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
|
||||
#data
|
||||
<!doctype html><table><caption><plaintext></plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><style></script></style>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "abc"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <style>
|
||||
| "</script>"
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><script></style></script>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "abc"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <script>
|
||||
| "</style>"
|
||||
|
||||
#data
|
||||
<!doctype html><table><caption><style></script></style>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| <style>
|
||||
| "</script>"
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<!doctype html><table><td><style></script></style>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <style>
|
||||
| "</script>"
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<!doctype html><select><script></style></script>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <script>
|
||||
| "</style>"
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<!doctype html><table><select><script></style></script>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <script>
|
||||
| "</style>"
|
||||
| "abc"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr><select><script></style></script>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <script>
|
||||
| "</style>"
|
||||
| "abc"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!doctype html><frameset></frameset><noframes>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
| <noframes>
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<!doctype html><frameset></frameset><noframes>abc</noframes><!--abc-->
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
| <noframes>
|
||||
| "abc"
|
||||
| <!-- abc -->
|
||||
|
||||
#data
|
||||
<!doctype html><frameset></frameset></html><noframes>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
| <noframes>
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<!doctype html><frameset></frameset></html><noframes>abc</noframes><!--abc-->
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
| <noframes>
|
||||
| "abc"
|
||||
| <!-- abc -->
|
||||
|
||||
#data
|
||||
<!doctype html><table><tr></tbody><tfoot>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <tfoot>
|
||||
|
||||
#data
|
||||
<!doctype html><table><td><svg></svg>abc<td>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <svg svg>
|
||||
| "abc"
|
||||
| <td>
|
||||
1237
html5lib/tests/testdata/tree-construction/tests19.dat
vendored
Normal file
1237
html5lib/tests/testdata/tree-construction/tests19.dat
vendored
Normal file
File diff suppressed because it is too large
Load Diff
763
html5lib/tests/testdata/tree-construction/tests2.dat
vendored
Normal file
763
html5lib/tests/testdata/tree-construction/tests2.dat
vendored
Normal file
@@ -0,0 +1,763 @@
|
||||
#data
|
||||
<!DOCTYPE html>Test
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "Test"
|
||||
|
||||
#data
|
||||
<textarea>test</div>test
|
||||
#errors
|
||||
Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
|
||||
Line: 1 Col: 24 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <textarea>
|
||||
| "test</div>test"
|
||||
|
||||
#data
|
||||
<table><td>
|
||||
#errors
|
||||
Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
|
||||
Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase.
|
||||
Line: 1 Col: 11 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
|
||||
#data
|
||||
<table><td>test</tbody></table>
|
||||
#errors
|
||||
Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
|
||||
Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "test"
|
||||
|
||||
#data
|
||||
<frame>test
|
||||
#errors
|
||||
Line: 1 Col: 7 Unexpected start tag (frame). Expected DOCTYPE.
|
||||
Line: 1 Col: 7 Unexpected start tag frame. Ignored.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "test"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><frameset>test
|
||||
#errors
|
||||
Line: 1 Col: 29 Unepxected characters in the frameset phase. Characters ignored.
|
||||
Line: 1 Col: 29 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><frameset><!DOCTYPE html>
|
||||
#errors
|
||||
Line: 1 Col: 40 Unexpected DOCTYPE. Ignored.
|
||||
Line: 1 Col: 40 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <frameset>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><font><p><b>test</font>
|
||||
#errors
|
||||
Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm.
|
||||
Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <font>
|
||||
| <p>
|
||||
| <font>
|
||||
| <b>
|
||||
| "test"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><dt><div><dd>
|
||||
#errors
|
||||
Line: 1 Col: 28 Missing end tag (div, dt).
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <dt>
|
||||
| <div>
|
||||
| <dd>
|
||||
|
||||
#data
|
||||
<script></x
|
||||
#errors
|
||||
Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE.
|
||||
Line: 1 Col: 11 Unexpected end of file. Expected end tag (script).
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <script>
|
||||
| "</x"
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<table><plaintext><td>
|
||||
#errors
|
||||
Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
|
||||
Line: 1 Col: 18 Unexpected start tag (plaintext) in table context caused voodoo mode.
|
||||
Line: 1 Col: 22 Unexpected end of file. Expected table content.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "<td>"
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<plaintext></plaintext>
|
||||
#errors
|
||||
Line: 1 Col: 11 Unexpected start tag (plaintext). Expected DOCTYPE.
|
||||
Line: 1 Col: 23 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <plaintext>
|
||||
| "</plaintext>"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><table><tr>TEST
|
||||
#errors
|
||||
Line: 1 Col: 30 Unexpected non-space characters in table context caused voodoo mode.
|
||||
Line: 1 Col: 30 Unexpected end of file. Expected table content.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "TEST"
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body t1=1><body t2=2><body t3=3 t4=4>
|
||||
#errors
|
||||
Line: 1 Col: 37 Unexpected start tag (body).
|
||||
Line: 1 Col: 53 Unexpected start tag (body).
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| t1="1"
|
||||
| t2="2"
|
||||
| t3="3"
|
||||
| t4="4"
|
||||
|
||||
#data
|
||||
</b test
|
||||
#errors
|
||||
Line: 1 Col: 8 Unexpected end of file in attribute name.
|
||||
Line: 1 Col: 8 End tag contains unexpected attributes.
|
||||
Line: 1 Col: 8 Unexpected end tag (b). Expected DOCTYPE.
|
||||
Line: 1 Col: 8 Unexpected end tag (b) after the (implied) root element.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html></b test<b &=&>X
|
||||
#errors
|
||||
Line: 1 Col: 32 Named entity didn't end with ';'.
|
||||
Line: 1 Col: 33 End tag contains unexpected attributes.
|
||||
Line: 1 Col: 33 Unexpected end tag (b) after the (implied) root element.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt
|
||||
#errors
|
||||
Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
|
||||
Line: 1 Col: 54 Unexpected end of file in the tag name.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <script>
|
||||
| type="text/x-foobar;baz"
|
||||
| "X</SCRipt"
|
||||
| <body>
|
||||
|
||||
#data
|
||||
&
|
||||
#errors
|
||||
Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&"
|
||||
|
||||
#data
|
||||
&#
|
||||
#errors
|
||||
Line: 1 Col: 1 Numeric entity expected. Got end of file instead.
|
||||
Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&#"
|
||||
|
||||
#data
|
||||
&#X
|
||||
#errors
|
||||
Line: 1 Col: 3 Numeric entity expected but none found.
|
||||
Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&#X"
|
||||
|
||||
#data
|
||||
&#x
|
||||
#errors
|
||||
Line: 1 Col: 3 Numeric entity expected but none found.
|
||||
Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&#x"
|
||||
|
||||
#data
|
||||
-
|
||||
#errors
|
||||
Line: 1 Col: 4 Numeric entity didn't end with ';'.
|
||||
Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "-"
|
||||
|
||||
#data
|
||||
&x-test
|
||||
#errors
|
||||
Line: 1 Col: 1 Named entity expected. Got none.
|
||||
Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&x-test"
|
||||
|
||||
#data
|
||||
<!doctypehtml><p><li>
|
||||
#errors
|
||||
Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <li>
|
||||
|
||||
#data
|
||||
<!doctypehtml><p><dt>
|
||||
#errors
|
||||
Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <dt>
|
||||
|
||||
#data
|
||||
<!doctypehtml><p><dd>
|
||||
#errors
|
||||
Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <dd>
|
||||
|
||||
#data
|
||||
<!doctypehtml><p><form>
|
||||
#errors
|
||||
Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
|
||||
Line: 1 Col: 23 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <form>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><p></P>X
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
&
|
||||
#errors
|
||||
Line: 1 Col: 4 Named entity didn't end with ';'.
|
||||
Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&"
|
||||
|
||||
#data
|
||||
&AMp;
|
||||
#errors
|
||||
Line: 1 Col: 1 Named entity expected. Got none.
|
||||
Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "&AMp;"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY>
|
||||
#errors
|
||||
Line: 1 Col: 110 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X</body>X
|
||||
#errors
|
||||
Line: 1 Col: 24 Unexpected non-space characters in the after body phase.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "XX"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><!-- X
|
||||
#errors
|
||||
Line: 1 Col: 21 Unexpected end of file in comment.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <!-- X -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><table><caption>test TEST</caption><td>test
|
||||
#errors
|
||||
Line: 1 Col: 54 Unexpected table cell start tag (td) in the table body phase.
|
||||
Line: 1 Col: 58 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <caption>
|
||||
| "test TEST"
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| "test"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><select><option><optgroup>
|
||||
#errors
|
||||
Line: 1 Col: 41 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <option>
|
||||
| <optgroup>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><select><optgroup><option></optgroup><option><select><option>
|
||||
#errors
|
||||
Line: 1 Col: 68 Unexpected select start tag in the select phase treated as select end tag.
|
||||
Line: 1 Col: 76 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <optgroup>
|
||||
| <option>
|
||||
| <option>
|
||||
| <option>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><select><optgroup><option><optgroup>
|
||||
#errors
|
||||
Line: 1 Col: 51 Expected closing tag. Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <optgroup>
|
||||
| <option>
|
||||
| <optgroup>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><datalist><option>foo</datalist>bar
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <datalist>
|
||||
| <option>
|
||||
| "foo"
|
||||
| "bar"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><font><input><input></font>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <font>
|
||||
| <input>
|
||||
| <input>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><!-- XXX - XXX -->
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <!-- XXX - XXX -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><!-- XXX - XXX
|
||||
#errors
|
||||
Line: 1 Col: 29 Unexpected end of file in comment (-)
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <!-- XXX - XXX -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><!-- XXX - XXX - XXX -->
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <!-- XXX - XXX - XXX -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<isindex test=x name=x>
|
||||
#errors
|
||||
Line: 1 Col: 23 Unexpected start tag (isindex). Expected DOCTYPE.
|
||||
Line: 1 Col: 23 Unexpected start tag isindex. Don't use it!
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
| <hr>
|
||||
| <label>
|
||||
| "This is a searchable index. Enter search keywords: "
|
||||
| <input>
|
||||
| name="isindex"
|
||||
| test="x"
|
||||
| <hr>
|
||||
|
||||
#data
|
||||
test
|
||||
test
|
||||
#errors
|
||||
Line: 2 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "test
|
||||
test"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><title>test</body></title>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <title>
|
||||
| "test</body>"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><body><title>X</title><meta name=z><link rel=foo><style>
|
||||
x { content:"</style" } </style>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <title>
|
||||
| "X"
|
||||
| <meta>
|
||||
| name="z"
|
||||
| <link>
|
||||
| rel="foo"
|
||||
| <style>
|
||||
| "
|
||||
x { content:"</style" } "
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><select><optgroup></optgroup></select>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <select>
|
||||
| <optgroup>
|
||||
|
||||
#data
|
||||
|
||||
|
||||
#errors
|
||||
Line: 2 Col: 1 Unexpected End of file. Expected DOCTYPE.
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html> <html>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><script>
|
||||
</script> <title>x</title> </head>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <script>
|
||||
| "
|
||||
"
|
||||
| " "
|
||||
| <title>
|
||||
| "x"
|
||||
| " "
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><html><body><html id=x>
|
||||
#errors
|
||||
Line: 1 Col: 38 html needs to be the first start tag.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| id="x"
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X</body><html id="x">
|
||||
#errors
|
||||
Line: 1 Col: 36 Unexpected start tag token (html) in the after body phase.
|
||||
Line: 1 Col: 36 html needs to be the first start tag.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| id="x"
|
||||
| <head>
|
||||
| <body>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><head><html id=x>
|
||||
#errors
|
||||
Line: 1 Col: 32 html needs to be the first start tag.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| id="x"
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X</html>X
|
||||
#errors
|
||||
Line: 1 Col: 24 Unexpected non-space characters in the after body phase.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "XX"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X</html>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "X "
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X</html><p>X
|
||||
#errors
|
||||
Line: 1 Col: 26 Unexpected start tag (p).
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "X"
|
||||
| <p>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
<!DOCTYPE html>X<p/x/y/z>
|
||||
#errors
|
||||
Line: 1 Col: 19 Expected a > after the /.
|
||||
Line: 1 Col: 21 Solidus (/) incorrectly placed in tag.
|
||||
Line: 1 Col: 23 Solidus (/) incorrectly placed in tag.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| "X"
|
||||
| <p>
|
||||
| x=""
|
||||
| y=""
|
||||
| z=""
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><!--x--
|
||||
#errors
|
||||
Line: 1 Col: 22 Unexpected end of file in comment (--).
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <!-- x -->
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!DOCTYPE html><table><tr><td></p></table>
|
||||
#errors
|
||||
Line: 1 Col: 34 Unexpected end tag (p). Ignored.
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<!DOCTYPE <!DOCTYPE HTML>><!--<!--x-->-->
|
||||
#errors
|
||||
Line: 1 Col: 20 Expected space or '>'. Got ''
|
||||
Line: 1 Col: 25 Erroneous DOCTYPE.
|
||||
Line: 1 Col: 35 Unexpected character in comment found.
|
||||
#document
|
||||
| <!DOCTYPE <!doctype>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| ">"
|
||||
| <!-- <!--x -->
|
||||
| "-->"
|
||||
|
||||
#data
|
||||
<!doctype html><div><form></form><div></div></div>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <form>
|
||||
| <div>
|
||||
455
html5lib/tests/testdata/tree-construction/tests20.dat
vendored
Normal file
455
html5lib/tests/testdata/tree-construction/tests20.dat
vendored
Normal file
@@ -0,0 +1,455 @@
|
||||
#data
|
||||
<!doctype html><p><button><button>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <button>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><address>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <address>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><blockquote>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <blockquote>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><menu>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <menu>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><p>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><ul>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <ul>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><h1>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <h1>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><h6>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <h6>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><listing>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <listing>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><pre>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <pre>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><form>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <form>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><li>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <li>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><dd>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <dd>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><dt>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <dt>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><plaintext>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <plaintext>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><table>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><hr>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <hr>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button><xmp>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <xmp>
|
||||
|
||||
#data
|
||||
<!doctype html><p><button></p>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <button>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<!doctype html><address><button></address>a
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <address>
|
||||
| <button>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<!doctype html><address><button></address>a
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <address>
|
||||
| <button>
|
||||
| "a"
|
||||
|
||||
#data
|
||||
<p><table></p>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <p>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><svg>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<!doctype html><p><figcaption>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <figcaption>
|
||||
|
||||
#data
|
||||
<!doctype html><p><summary>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <p>
|
||||
| <summary>
|
||||
|
||||
#data
|
||||
<!doctype html><form><table><form>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
| <table>
|
||||
|
||||
#data
|
||||
<!doctype html><table><form><form>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <form>
|
||||
|
||||
#data
|
||||
<!doctype html><table><form></table><form>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <table>
|
||||
| <form>
|
||||
|
||||
#data
|
||||
<!doctype html><svg><foreignObject><p>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg foreignObject>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<!doctype html><svg><title>abc
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE html>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg title>
|
||||
| "abc"
|
||||
|
||||
#data
|
||||
<option><span><option>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <option>
|
||||
| <span>
|
||||
| <option>
|
||||
|
||||
#data
|
||||
<option><option>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <option>
|
||||
| <option>
|
||||
|
||||
#data
|
||||
<math><annotation-xml><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding="application/svg+xml"><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding="application/svg+xml"
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding="application/xhtml+xml"><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding="application/xhtml+xml"
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding="aPPlication/xhtmL+xMl"><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding="aPPlication/xhtmL+xMl"
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding="text/html"><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding="text/html"
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding="Text/htmL"><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding="Text/htmL"
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<math><annotation-xml encoding=" text/html "><div>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| <math annotation-xml>
|
||||
| encoding=" text/html "
|
||||
| <div>
|
||||
221
html5lib/tests/testdata/tree-construction/tests21.dat
vendored
Normal file
221
html5lib/tests/testdata/tree-construction/tests21.dat
vendored
Normal file
@@ -0,0 +1,221 @@
|
||||
#data
|
||||
<svg><![CDATA[foo]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<math><![CDATA[foo]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <math math>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<div><![CDATA[foo]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <div>
|
||||
| <!-- [CDATA[foo]] -->
|
||||
|
||||
#data
|
||||
<svg><![CDATA[foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[foo
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]] >]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "]] >"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]] >]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "]] >"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]]
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "]]"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "]"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[]>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "]>a"
|
||||
|
||||
#data
|
||||
<svg><foreignObject><div><![CDATA[foo]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| <svg foreignObject>
|
||||
| <div>
|
||||
| <!-- [CDATA[foo]] -->
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[</svg>a]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "</svg>a"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>a"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[</svg>a
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "</svg>a"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>]]><path>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>"
|
||||
| <svg path>
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>]]></path>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>]]><!--path-->
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>"
|
||||
| <!-- path -->
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<svg>]]>path
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<svg>path"
|
||||
|
||||
#data
|
||||
<svg><![CDATA[<!--svg-->]]>
|
||||
#errors
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <svg svg>
|
||||
| "<!--svg-->"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user