diff --git a/bs4/__init__.py b/bs4/__init__.py new file mode 100644 index 00000000..80f6f684 --- /dev/null +++ b/bs4/__init__.py @@ -0,0 +1,359 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides provides methods and Pythonic idioms that make it easy to +navigate, search, and modify the parse tree. + +Beautiful Soup works with Python 2.6 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.1.3" +__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import re +import warnings + +from .builder import builder_registry +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. 
+ + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, **kwargs): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. 
You can pass in features='html' " + "or features='xml' to get a builder capable of handling " + "one or the other.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if len(kwargs) > 0: + arg = kwargs.keys().pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + + if builder is None: + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + builder = builder_class() + self.builder = builder + self.is_xml = builder.is_xml + self.builder.soup = self + + self.parse_only = parse_only + + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) = ( + self.builder.prepare_markup(markup, from_encoding)) + + try: + self._feed() + except StopParsing: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. 
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + """Create a new tag associated with this soup.""" + return Tag(None, self.builder, name, namespace, nsprefix, attrs) + + def new_string(self, s): + """Create a new NavigableString associated with this soup.""" + navigable = NavigableString(s) + navigable.setup() + return navigable + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.builder.preserve_whitespace_tags)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(currentData)): + return + o = containerClass(currentData) + self.object_was_parsed(o) + + def object_was_parsed(self, o): + """Add an object to the parse tree.""" + o.setup(self.currentTag, self.previous_element) + if self.previous_element: + 
self.previous_element.next_element = o + self.previous_element = o + self.currentTag.contents.append(o) + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + + for i in range(len(self.tagStack) - 1, 0, -1): + if (name == self.tagStack[i].name + and nsprefix == self.tagStack[i].prefix): + numPops = len(self.tagStack) - i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self.previous_element) + if tag is None: + return tag + if self.previous_element: + self.previous_element.next_element = tag + self.previous_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.currentData.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. 
+ To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py new file mode 100644 index 00000000..dc7deb93 --- /dev/null +++ b/bs4/builder/__init__.py @@ -0,0 +1,316 @@ +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + whitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. 
+FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. 
+builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + features = [] + + is_xml = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + cdata_list_attributes = {} + + + def __init__(self): + self.soup = None + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a

tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty

tag + will be presented as "

", not "

". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), []) + for cdata_list_attr in itertools.chain(universal, tag_specific): + if cdata_list_attr in dict(attrs): + # Basically, we have a "class" attribute whose + # value is a whitespace-separated list of CSS + # classes. Split it into a list. + value = attrs[cdata_list_attr] + if isinstance(value, basestring): + values = whitespace_re.split(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. 
+ values = value + attrs[cdata_list_attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. + """ + + preserve_whitespace_tags = set(['pre', 'textarea']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. 
+ cdata_list_attributes = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. 
+ this_module.builder_registry.register(obj) + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py new file mode 100644 index 00000000..6001e386 --- /dev/null +++ b/bs4/builder/_html5lib.py @@ -0,0 +1,222 @@ +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import NamespacedAttribute +import html5lib +from html5lib.constants import namespaces +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + features = ['html5lib', PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + return markup, None, None, False + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + doc = parser.parse(markup, encoding=self.user_specified_encoding) + + # Set the character encoding detected by the tokenizer. 
+ if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment + + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. 
+ self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # XXX This has O(n^2) performance, for input like + # "aaa..." 
+ old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + node.element) + old_element.replace_with(new_element) + else: + self.element.append(node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild( + 
Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild( + TextNode(child, self.soup)) + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 00000000..ede5cecb --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,244 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import ( + HTMLParser, + HTMLParseError, + ) +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. 
+major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = ( + major > 3 + or (major == 3 and minor > 2) + or (major == 3 and minor == 2 and release >= 3)) + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + def handle_starttag(self, name, attrs): + # XXX namespace + self.soup.handle_starttag(name, None, None, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed. + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + else: + real_name = int(name) + + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + data = u"\N{REPLACEMENT CHARACTER}" + + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + if data.endswith("?") and data.lower().startswith("xml"): + # "An XHTML processing instruction using the trailing '?' 
+ # will cause the '?' to be included in data." - HTMLParser + # docs. + # + # Strip the question mark so we don't end up with two + # question marks. + data = data[:-1] + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + except HTMLParseError, e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like

as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + 
else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py new file mode 100644 index 00000000..f6b91ff5 --- /dev/null +++ b/bs4/builder/_lxml.py @@ -0,0 +1,199 @@ +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +from StringIO import StringIO +import collections +from lxml import etree +from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + TreeBuilder, + XML) +from bs4.dammit import UnicodeDammit + +LXML = 'lxml' + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + + # Well, it's permissive by XML parser standards. + features = [LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + @property + def default_parser(self): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + return etree.XMLParser(target=self, strip_cdata=False, recover=True) + + def __init__(self, parser=None, empty_element_tags=None): + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + if parser is None: + # Use the default parser. 
+ parser = self.default_parser + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False) + self.parser = parser + self.soup = None + self.nsmaps = None + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + if isinstance(markup, basestring): + markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + self.parser.feed(data) + while data != '': + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if data != '': + self.parser.feed(data) + self.parser.close() + + def close(self): + self.nsmaps = None + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and self.nsmaps != None: + # There are no new namespaces for this tag, but namespaces + # are in play, so we need a separate tag stack to know + # when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. 
+ if self.nsmaps is None: + self.nsmaps = [] + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + if self.nsmaps is not None and len(self.nsmaps) > 0: + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if self.nsmaps != None: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. 
+ self.nsmaps.pop() + if len(self.nsmaps) == 0: + # Namespaces are no longer in play, so don't bother keeping + # track of the namespace stack. + self.nsmaps = None + + def pi(self, target, data): + pass + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + features = [LXML, HTML, FAST, PERMISSIVE] + is_xml = False + + @property + def default_parser(self): + return etree.HTMLParser + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment diff --git a/bs4/dammit.py b/bs4/dammit.py new file mode 100644 index 00000000..983ade0f --- /dev/null +++ b/bs4/dammit.py @@ -0,0 +1,803 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's the tree builder's job. +""" + +import codecs +from htmlentitydefs import codepoint2name +import re +import logging + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. 
+ # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + for codepoint, name in list(codepoint2name.items()): + character = unichr(codepoint) + if codepoint != 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to turn " into the quotation mark. 
+ reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. 
+ quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. 
+ CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False): + self.declared_html_encoding = None + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + + if markup == '' or isinstance(markup, unicode): + self.markup = markup + self.unicode_markup = unicode(markup) + self.original_encoding = None + return + + new_markup, document_encoding, sniffed_encoding = \ + self._detectEncoding(markup, is_html) + self.markup = new_markup + + u = None + if new_markup != markup: + # _detectEncoding modified the markup, then converted it to + # Unicode and then to UTF-8. So convert it from UTF-8. + u = self._convert_from("utf8") + self.original_encoding = sniffed_encoding + + if not u: + for proposed_encoding in ( + override_encodings + [document_encoding, sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) + if u: + break + + # If no luck and we have auto-detection library, try that: + if not u and not isinstance(self.markup, unicode): + u = self._convert_from(chardet_dammit(self.markup)) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convert_from(proposed_encoding) + if u: + break + + # As an absolute last resort, try the encodings again with + # character replacement. 
+ if not u: + for proposed_encoding in ( + override_encodings + [ + document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): + if proposed_encoding != "ascii": + u = self._convert_from(proposed_encoding, "replace") + if u is not None: + logging.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER.") + self.contains_replacement_characters = True + break + + # We could at this point force it to ASCII, but that would + # destroy so much data that I think giving up is better + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" 
+ #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding, errors) + return newdata + + def _detectEncoding(self, xml_data, is_html=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == b'\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == b'\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ + and (xml_data[2:4] != b'\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ + (xml_data[2:4] != b'\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = 
unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == b'\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_match = xml_encoding_re.match(xml_data) + if not xml_encoding_match and is_html: + xml_encoding_match = html_meta_re.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if is_html: + self.declared_html_encoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + EBCDIC_TO_ASCII_MAP = None + + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
+ MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. 
+ b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. 
+    WINDOWS_1252_TO_UTF8 = {
+        0x80 : b'\xe2\x82\xac', # €
+        0x82 : b'\xe2\x80\x9a', # ‚
+        0x83 : b'\xc6\x92',     # ƒ
+        0x84 : b'\xe2\x80\x9e', # „
+        0x85 : b'\xe2\x80\xa6', # …
+        0x86 : b'\xe2\x80\xa0', # †
+        0x87 : b'\xe2\x80\xa1', # ‡
+        0x88 : b'\xcb\x86',     # ˆ
+        0x89 : b'\xe2\x80\xb0', # ‰
+        0x8a : b'\xc5\xa0',     # Š
+        0x8b : b'\xe2\x80\xb9', # ‹
+        0x8c : b'\xc5\x92',     # Œ
+        0x8e : b'\xc5\xbd',     # Ž
+        0x91 : b'\xe2\x80\x98', # ‘
+        0x92 : b'\xe2\x80\x99', # ’
+        0x93 : b'\xe2\x80\x9c', # “
+        0x94 : b'\xe2\x80\x9d', # ”
+        0x95 : b'\xe2\x80\xa2', # •
+        0x96 : b'\xe2\x80\x93', # –
+        0x97 : b'\xe2\x80\x94', # —
+        0x98 : b'\xcb\x9c',     # ˜
+        0x99 : b'\xe2\x84\xa2', # ™
+        0x9a : b'\xc5\xa1',     # š
+        0x9b : b'\xe2\x80\xba', # ›
+        0x9c : b'\xc5\x93',     # œ
+        0x9e : b'\xc5\xbe',     # ž
+        0x9f : b'\xc5\xb8',     # Ÿ
+        0xa0 : b'\xc2\xa0',     # non-breaking space
+        0xa1 : b'\xc2\xa1',     # ¡
+        0xa2 : b'\xc2\xa2',     # ¢
+        0xa3 : b'\xc2\xa3',     # £
+        0xa4 : b'\xc2\xa4',     # ¤
+        0xa5 : b'\xc2\xa5',     # ¥
+        0xa6 : b'\xc2\xa6',     # ¦
+        0xa7 : b'\xc2\xa7',     # §
+        0xa8 : b'\xc2\xa8',     # ¨
+        0xa9 : b'\xc2\xa9',     # ©
+        0xaa : b'\xc2\xaa',     # ª
+        0xab : b'\xc2\xab',     # «
+        0xac : b'\xc2\xac',     # ¬
+        0xad : b'\xc2\xad',     # soft hyphen
+        0xae : b'\xc2\xae',     # ®
+        0xaf : b'\xc2\xaf',     # ¯
+        0xb0 : b'\xc2\xb0',     # °
+        0xb1 : b'\xc2\xb1',     # ±
+        0xb2 : b'\xc2\xb2',     # ²
+        0xb3 : b'\xc2\xb3',     # ³
+        0xb4 : b'\xc2\xb4',     # ´
+        0xb5 : b'\xc2\xb5',     # µ
+        0xb6 : b'\xc2\xb6',     # ¶
+        0xb7 : b'\xc2\xb7',     # ·
+        0xb8 : b'\xc2\xb8',     # ¸
+        0xb9 : b'\xc2\xb9',     # ¹
+        0xba : b'\xc2\xba',     # º
+        0xbb : b'\xc2\xbb',     # »
+        0xbc : b'\xc2\xbc',     # ¼
+        0xbd : b'\xc2\xbd',     # ½
+        0xbe : b'\xc2\xbe',     # ¾
+        0xbf : b'\xc2\xbf',     # ¿
+        0xc0 : b'\xc3\x80',     # À
+        0xc1 : b'\xc3\x81',     # Á
+        0xc2 : b'\xc3\x82',     # Â
+        0xc3 : b'\xc3\x83',     # Ã
+        0xc4 : b'\xc3\x84',     # Ä
+        0xc5 : b'\xc3\x85',     # Å
+        0xc6 : b'\xc3\x86',     # Æ
+        0xc7 : b'\xc3\x87',     # Ç
+        0xc8 : b'\xc3\x88',     # È
+        0xc9 : b'\xc3\x89',     # É
+        0xca : b'\xc3\x8a',     # Ê
+        0xcb : b'\xc3\x8b',     # Ë
+        0xcc : b'\xc3\x8c',     # Ì
+        0xcd : b'\xc3\x8d',     # Í
+        0xce : b'\xc3\x8e',     # Î
+        0xcf : b'\xc3\x8f',     # Ï
+        0xd0 : b'\xc3\x90',     # Ð
+        0xd1 : b'\xc3\x91',     # Ñ
+        0xd2 : b'\xc3\x92',     # Ò
+        0xd3 : b'\xc3\x93',     # Ó
+        0xd4 : b'\xc3\x94',     # Ô
+        0xd5 : b'\xc3\x95',     # Õ
+        0xd6 : b'\xc3\x96',     # Ö
+        0xd7 : b'\xc3\x97',     # ×
+        0xd8 : b'\xc3\x98',     # Ø
+        0xd9 : b'\xc3\x99',     # Ù
+        0xda : b'\xc3\x9a',     # Ú
+        0xdb : b'\xc3\x9b',     # Û
+        0xdc : b'\xc3\x9c',     # Ü
+        0xdd : b'\xc3\x9d',     # Ý
+        0xde : b'\xc3\x9e',     # Þ
+        0xdf : b'\xc3\x9f',     # ß
+        0xe0 : b'\xc3\xa0',     # à
+        0xe1 : b'\xc3\xa1',     # á (was b'\xa1', an invalid one-byte UTF-8 fragment)
+        0xe2 : b'\xc3\xa2',     # â
+        0xe3 : b'\xc3\xa3',     # ã
+        0xe4 : b'\xc3\xa4',     # ä
+        0xe5 : b'\xc3\xa5',     # å
+        0xe6 : b'\xc3\xa6',     # æ
+        0xe7 : b'\xc3\xa7',     # ç
+        0xe8 : b'\xc3\xa8',     # è
+        0xe9 : b'\xc3\xa9',     # é
+        0xea : b'\xc3\xaa',     # ê
+        0xeb : b'\xc3\xab',     # ë
+        0xec : b'\xc3\xac',     # ì
+        0xed : b'\xc3\xad',     # í
+        0xee : b'\xc3\xae',     # î
+        0xef : b'\xc3\xaf',     # ï
+        0xf0 : b'\xc3\xb0',     # ð
+        0xf1 : b'\xc3\xb1',     # ñ
+        0xf2 : b'\xc3\xb2',     # ò
+        0xf3 : b'\xc3\xb3',     # ó
+        0xf4 : b'\xc3\xb4',     # ô
+        0xf5 : b'\xc3\xb5',     # õ
+        0xf6 : b'\xc3\xb6',     # ö
+        0xf7 : b'\xc3\xb7',     # ÷
+        0xf8 : b'\xc3\xb8',     # ø
+        0xf9 : b'\xc3\xb9',     # ù
+        0xfa : b'\xc3\xba',     # ú
+        0xfb : b'\xc3\xbb',     # û
+        0xfc : b'\xc3\xbc',     # ü
+        0xfd : b'\xc3\xbd',     # ý
+        0xfe : b'\xc3\xbe',     # þ
+        }
+
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
+    ]
+
+    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
+    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
+
+    @classmethod
+    def detwingle(cls, in_bytes, main_encoding="utf8",
+                  embedded_encoding="windows-1252"):
+        """Fix characters from one encoding embedded in some other encoding.
+
+        Currently the only situation supported is Windows-1252 (or its
+        subset ISO-8859-1), embedded in UTF-8.
+
+        The input must be a bytestring. If you've already converted
+        the document to Unicode, you're too late.
+
+        The output is a bytestring in which `embedded_encoding`
+        characters have been converted to their `main_encoding`
+        equivalents.
+        """
+        if embedded_encoding.replace('_', '-').lower() not in (
+            'windows-1252', 'windows_1252'):
+            raise NotImplementedError(
+                "Windows-1252 and ISO-8859-1 are the only currently supported "
+                "embedded encodings.")
+
+        if main_encoding.lower() not in ('utf8', 'utf-8'):
+            raise NotImplementedError(
+                "UTF-8 is the only currently supported main encoding.")
+
+        byte_chunks = []
+
+        chunk_start = 0
+        pos = 0
+        while pos < len(in_bytes):
+            byte = in_bytes[pos]
+            if not isinstance(byte, int):
+                # Python 2.x
+                byte = ord(byte)
+            if (byte >= cls.FIRST_MULTIBYTE_MARKER
+                and byte <= cls.LAST_MULTIBYTE_MARKER):
+                # This is the start of a UTF-8 multibyte character. Skip
+                # to the end.
+                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
+                    if byte >= start and byte <= end:
+                        pos += size
+                        break
+            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
+                # We found a Windows-1252 character!
+                # Save the string up to this point as a chunk.
+                byte_chunks.append(in_bytes[chunk_start:pos])
+
+                # Now translate the Windows-1252 character into UTF-8
+                # and add it as another, one-byte chunk.
+                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
+                pos += 1
+                chunk_start = pos
+            else:
+                # Go on to the next character.
+                pos += 1
+        if chunk_start == 0:
+            # The string is unchanged.
+            return in_bytes
+        else:
+            # Store the final chunk.
+ byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/bs4/element.py b/bs4/element.py new file mode 100644 index 00000000..26422fda --- /dev/null +++ b/bs4/element.py @@ -0,0 +1,1355 @@ +import collections +import re +import sys +import warnings +from bs4.dammit import EntitySubstitution + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +whitespace_re = re.compile("\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +class NamespacedAttribute(unicode): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = unicode.__new__(cls, prefix) + else: + obj = unicode.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(unicode): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + + + The value of the 'content' attribute will be one of these objects. 
+ """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + # There are five possible values for the "formatter" argument passed in + # to methods like encode() and prettify(): + # + # "html" - All Unicode characters with corresponding HTML entities + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to + # XML entities: & < > + # None - The null formatter. Unicode characters are never + # converted to entities. This is not recommended, but it's + # faster than "minimal". 
+ # A function - This function will be called on every string that + # needs to undergo entity substition + FORMATTERS = { + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, + None : None + } + + @classmethod + def format_string(self, s, formatter='minimal'): + """Format the given string using the given formatter.""" + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = s + else: + output = formatter(s) + return output + + def setup(self, parent=None, previous_element=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + self.next_element = None + self.previous_sibling = None + self.next_sibling = None + if self.parent is not None and self.parent.contents: + self.previous_sibling = self.parent.contents[-1] + self.previous_sibling.next_sibling = self + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract() + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + my_parent = self.parent + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self): + """Destructively rips this element out of the tree.""" + if 
self.parent is not None: + del self.parent.contents[self.parent.index(self)] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if self.previous_element is not None: + self.previous_element.next_element = next_element + if next_element is not None: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if self.previous_sibling is not None: + self.previous_sibling.next_sibling = self.next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self): + "Finds the last element beneath this object to be parsed." + last_child = self + while hasattr(last_child, 'contents') and last_child.contents: + last_child = last_child.contents[-1] + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, basestring) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. 
+ position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant() + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant() + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def insert_before(self, predecessor): + """Makes the given element the immediate predecessor of this one. + + The two elements will have the same parent, and the given element + will be immediately before this one. 
+ """ + if self is predecessor: + raise ValueError("Can't insert an element before itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, successor): + """Makes the given element the immediate successor of this one. + + The two elements will have the same parent, and the given element + will be immediately after this one. + """ + if self is successor: + raise ValueError("Can't insert an element after itself.") + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1, successor) + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, 
name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. 
+ r = None + l = self.find_parents(name, attrs, 1) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + elif text is None and not limit and not attrs and not kwargs: + # Optimization to find all tags. + if name is True or name is None: + return [element for element in generator + if isinstance(element, Tag)] + # Optimization to find all tags with a given name. + elif isinstance(name, basestring): + return [element for element in generator + if isinstance(element, Tag) and element.name == name] + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. 
+ @property + def next_elements(self): + i = self.next_element + while i is not None: + yield i + i = i.next_element + + @property + def next_siblings(self): + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + i = self.parent + while i is not None: + yield i + i = i.parent + + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'=?"?(?P[^\]"]*)"?\]$' + ) + + def _attr_value_as_string(self, value, default=None): + """Force an attribute value into a string representation. + + A multi-valued attribute will be converted into a + space-separated stirng. + """ + value = self.get(value, default) + if isinstance(value, list) or isinstance(value, tuple): + value =" ".join(value) + return value + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. 
+ """ + if operator == '=': + # string representation of `attribute` is equal to `value` + return lambda el: el._attr_value_as_string(attribute) == value + elif operator == '~': + # space-separated list representation of `attribute` + # contains `value` + def _includes_value(element): + attribute_value = element.get(attribute, []) + if not isinstance(attribute_value, list): + attribute_value = attribute_value.split() + return value in attribute_value + return _includes_value + elif operator == '^': + # string representation of `attribute` starts with `value` + return lambda el: el._attr_value_as_string( + attribute, '').startswith(value) + elif operator == '$': + # string represenation of `attribute` ends with `value` + return lambda el: el._attr_value_as_string( + attribute, '').endswith(value) + elif operator == '*': + # string representation of `attribute` contains `value` + return lambda el: value in el._attr_value_as_string(attribute, '') + elif operator == '|': + # string representation of `attribute` is either exactly + # `value` or starts with `value` and then a dash. + def _is_or_starts_with_dash(element): + attribute_value = element._attr_value_as_string(attribute, '') + return (attribute_value == value or attribute_value.startswith( + value + '-')) + return _is_or_starts_with_dash + else: + return lambda el: el.has_attr(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + tokens = selector.split() + current_context = [self] + for index, token in enumerate(tokens): + if tokens[index - 1] == '>': + # already found direct descendants in last step. skip this + # step. 
+ continue + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend( + [el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + + if '.' in token: + # Class selector + tag_name, klass = token.split('.', 1) + if not tag_name: + tag_name = True + classes = set(klass.split('.')) + found = [] + def classes_match(tag): + if tag_name is not True and tag.name != tag_name: + return False + if not tag.has_attr('class'): + return False + return classes.issubset(tag['class']) + for context in current_context: + found.extend(context.find_all(classes_match)) + current_context = found + continue + + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + current_context = found + continue + + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. 
# Old non-property spellings of the traversal generators, kept so BS3-era
# client code keeps working.
    def nextGenerator(self):
        # BS3 name for the `next_elements` property.
        return self.next_elements

    def nextSiblingGenerator(self):
        # BS3 name for the `next_siblings` property.
        return self.next_siblings

    def previousGenerator(self):
        # BS3 name for the `previous_elements` property.
        return self.previous_elements

    def previousSiblingGenerator(self):
        # BS3 name for the `previous_siblings` property.
        return self.previous_siblings

    def parentGenerator(self):
        # BS3 name for the `parents` property.
        return self.parents


class NavigableString(unicode, PageElement):
    # A text node: a real (Py2) unicode string that also participates in
    # the parse tree via PageElement.  PREFIX/SUFFIX wrap the string on
    # output; subclasses override them (e.g. Comment, CData).

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct from the plain unicode value.
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
        # Entity-substitute the text, then wrap it in the class's output
        # delimiters.
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX


class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
+ But the return value is ignored.""" + self.format_string(self, formatter) + return self.PREFIX + self + self.SUFFIX + +class CData(PreformattedString): + + PREFIX = u'' + +class ProcessingInstruction(PreformattedString): + + PREFIX = u'' + +class Comment(PreformattedString): + + PREFIX = u'' + + +class Declaration(PreformattedString): + PREFIX = u'' + + +class Doctype(PreformattedString): + + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' "%s"' % system_id + elif system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + + PREFIX = u'\n' + + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, parser=None, builder=None, name=None, namespace=None, + prefix=None, attrs=None, parent=None, previous=None): + "Basic constructor." + + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if attrs is None: + attrs = {} + elif builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + # Set up any substitutions, such as the charset in a META tag. + if builder is not None: + builder.set_up_substitutions(self) + self.can_be_empty_element = builder.can_be_empty_element(name) + else: + self.can_be_empty_element = False + + parserClass = _alias("parser_class") # BS3 + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? 
(aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this tag. + + :Return: If this tag has a single string child, return value + is that string. If this tag has no children, or more than one + child, return value is None. If this tag has one child tag, + return value is the 'string' attribute of the child tag, + recursively. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False): + """Yield all child strings, possibly stripping them.""" + for descendant in self.descendants: + if not isinstance(descendant, NavigableString): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + strings = property(_all_strings) + + @property + def stripped_strings(self): + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False): + """ + Get all child strings, concatenated using the given separator. 
+ """ + return separator.join([s for s in self._all_strings(strip)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next_element + i.__dict__.clear() + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. + """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def has_attr(self, key): + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." 
+ return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag=="contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.encode(encoding) + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, 
encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ + attrs = [] + if self.attrs: + for key, val in sorted(self.attrs.items()): + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = unicode(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None): + val = val.encode(eventual_encoding) + + text = self.format_string(val, formatter) + decoded = ( + unicode(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = '/' + else: + closeTag = '' % (prefix, self.name) + + pretty_print = (indent_level is not None) + if pretty_print: + space = (' ' * (indent_level - 1)) + indent_contents = indent_level + 1 + else: + space = '' + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter) + + if self.hidden: + # This is the 'document root' object. 
+ s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if pretty_print: + s.append(space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if pretty_print and closeTag and self.next_sibling: + s.append("\n") + s = ''.join(s) + return s + + def prettify(self, encoding=None, formatter="minimal"): + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. 
+ """ + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + if text and indent_level: + text = text.strip() + if text: + if pretty_print: + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print: + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a bytestring.""" + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. 
The + same is true of the tag name.""" + + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + # This was kind of misleading because has_key() (attributes) was + # different from __in__ (contents). has_key() is gone in Python 3, + # anyway. + has_key = has_attr + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. 
+ kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in attrs.items(): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. + if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, unicode)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. 
+ return unicode(str(value)) + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + call_function_with_tag_data = ( + isinstance(self.name, collections.Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. 
+ elif isinstance(markup, NavigableString) or \ + isinstance(markup, basestring): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. + if (isinstance(match_against, unicode) + and ' ' in match_against): + # A bit of a special case. If they try to match "foo + # bar" on a multivalue attribute's value, only accept + # the literal value "foo bar" + # + # XXX This is going to be pretty slow because we keep + # splitting match_against. But it shouldn't come up + # too often. + return (whitespace_re.split(match_against) == markup) + else: + for item in markup: + if self._matches(item, match_against): + return True + return False + + if match_against is True: + # True matches any non-None value. + return markup is not None + + if isinstance(match_against, collections.Callable): + return match_against(markup) + + # Custom callables take the tag as an argument, but all + # other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + + # Ensure that `markup` is either a Unicode string, or None. + markup = self._normalize_search_value(markup) + + if markup is None: + # None matches None, False, an empty string, an empty list, and so on. + return not match_against + + if isinstance(match_against, unicode): + # Exact string match + return markup == match_against + + if hasattr(match_against, 'match'): + # Regexp match + return match_against.search(markup) + + if hasattr(match_against, '__iter__'): + # The markup must be an exact match against something + # in the iterable. 
+ return markup in match_against + + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source diff --git a/bs4/testing.py b/bs4/testing.py new file mode 100644 index 00000000..30e74f42 --- /dev/null +++ b/bs4/testing.py @@ -0,0 +1,537 @@ +"""Helper classes for tests.""" + +import copy +import functools +import unittest +from unittest import TestCase +from bs4 import BeautifulSoup +from bs4.element import ( + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + SoupStrainer, +) + +from bs4.builder import HTMLParserTreeBuilder +default_builder = HTMLParserTreeBuilder + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder() + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder.test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + +class HTMLTreeBuilderSmokeTest(object): + + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. 
+ """ + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment): + """Generate and parse a document with the given doctype.""" + doctype = '' % doctype_fragment + markup = doctype + '\n

foo

' + soup = self.soup(markup) + return doctype, soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A

tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("

") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "

") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("

", "

") + self.assertSoupEquals("", "") + + self.assertSoupEquals("
", "
") + + def test_br_is_always_empty_element_tag(self): + """A
tag is designated as an empty-element tag. + + Some parsers treat

as one
tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("

") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "
") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("") + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "

foobaz

" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEqual(comment.__class__, Comment) + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in
 and ")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "Inside a B tag"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "

A nested tag

" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "

A doubly nested tag

" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup('

Foo

') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_deeply_nested_multivalued_attribute(self): + # html5lib can set the attributes of the same tag many times + # as it rearranges the tree. This has caused problems with + # multivalued attributes. + markup = '
' + soup = self.soup(markup) + self.assertEqual(["css"], soup.div.div['class']) + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('', '') + + def test_entities_in_attributes_converted_to_unicode(self): + expect = u'

' + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + + def test_quot_entity_converted_to_quotation_mark(self): + self.assertSoupEquals("

I said "good day!"

", + '

I said "good day!"

') + + def test_out_of_range_entity(self): + expect = u"\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'4' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. 
+ # + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A bold statement", + parse_only=strainer) + self.assertEqual(soup.decode(), "bold") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("", + '') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """a""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """a""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """a""") + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assertSoupEquals('', + '') + + self.assertSoupEquals( + 'foo', + 'foo') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" + self.assertSoupEquals(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = b"

\x91Foo\x92

" + soup = self.soup(quote) + self.assertEqual( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + + def test_entities_converted_on_the_way_out(self): + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + + def test_real_iso_latin_document(self): + # Smoke test of interrelated functionality, using an + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. + unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + + # That's because we're going to encode it into ISO-Latin-1, and use + # that to test. + iso_latin_html = unicode_html.encode("iso-8859-1") + + # Parse the ISO-Latin-1 HTML. + soup = self.soup(iso_latin_html) + # Encode it to UTF-8. + result = soup.encode("utf-8") + + # What do we expect the result to look like? Well, it would + # look like unicode_html, except that the META tag would say + # UTF-8 instead of ISO-Latin-1. + expected = unicode_html.replace("ISO-Latin-1", "utf-8") + + # And, of course, it would be in UTF-8, not Unicode. + expected = expected.encode("utf-8") + + # Ta-da! + self.assertEqual(result, expected) + + def test_real_shift_jis_document(self): + # Smoke test to make sure the parser can handle a document in + # Shift-JIS encoding, without choking. + shift_jis_html = ( + b'
'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'
') + unicode_html = shift_jis_html.decode("shift-jis") + soup = self.soup(unicode_html) + + # Make sure the parse tree is correctly encoded to various + # encodings. + self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) + self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-9 (a + # Hebrew encoding) to UTF-8. + hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' + soup = self.soup( + hebrew_document, from_encoding="iso8859-8") + self.assertEqual(soup.original_encoding, 'iso8859-8') + self.assertEqual( + soup.encode('utf-8'), + hebrew_document.decode("iso8859-8").encode("utf-8")) + + def test_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) + content = parsed_meta['content'] + self.assertEqual('text/html; charset=x-sjis', content) + + # But that value is actually a ContentMetaAttributeValue object. + self.assertTrue(isinstance(content, ContentMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('text/html; charset=utf8', content.encode("utf8")) + + # For the rest of the story, see TestSubstitutions in + # test_tree.py. + + def test_html5_style_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', id="encoding") + charset = parsed_meta['charset'] + self.assertEqual('x-sjis', charset) + + # But that value is actually a CharsetMetaAttributeValue object. + self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. 
+ self.assertEqual('utf8', charset.encode("utf8")) + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("text") + data.a['foo'] = 'bar' + self.assertEqual('text', data.a.decode()) + +class XMLTreeBuilderSmokeTest(object): + + def test_docstring_generated(self): + soup = self.soup("") + self.assertEqual( + soup.encode(), b'\n') + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + def test_popping_namespaced_tag(self): + markup = 'b2012-07-02T20:33:42Zcd' + soup = self.soup(markup) + self.assertEqual( + unicode(soup.rss), markup) + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("") + self.assertEqual( + soup.encode("latin1"), + b'\n') + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'\n' + + b'0' * (2**12) + + b'') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("

", "

") + self.assertSoupEquals("

foo

") + + def test_namespaces_are_preserved(self): + markup = 'This tag is in the a namespaceThis tag is in the b namespace' + soup = self.soup(markup) + root = soup.root + self.assertEqual("http://example.com/", root['xmlns:a']) + self.assertEqual("http://example.net/", root['xmlns:b']) + + def test_closing_namespaced_tag(self): + markup = '

20010504

' + soup = self.soup(markup) + self.assertEqual(unicode(soup.p), markup) + + def test_namespaced_attributes(self): + markup = '' + soup = self.soup(markup) + self.assertEqual(unicode(soup.foo), markup) + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. + pass + + def test_html_tags_have_namespace(self): + markup = "" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '5' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/data/interfaces/default/base.html b/data/interfaces/default/base.html index 4fc8590b..152f9311 100644 --- a/data/interfaces/default/base.html +++ b/data/interfaces/default/base.html @@ -36,7 +36,7 @@ % elif headphones.CURRENT_VERSION != headphones.LATEST_VERSION and headphones.INSTALL_TYPE != 'win':
- A newer version is available. You're ${headphones.COMMITS_BEHIND} commits behind. Update or Close + A newer version is available. You're ${headphones.COMMITS_BEHIND} commits behind. Update or Close
% endif diff --git a/data/interfaces/default/config.html b/data/interfaces/default/config.html index 643af20e..edbf1834 100644 --- a/data/interfaces/default/config.html +++ b/data/interfaces/default/config.html @@ -302,6 +302,19 @@ m<%inherit file="base.html"/> +
+ +
+
+
+ + +
+
+ + +
+
@@ -926,6 +939,7 @@ m<%inherit file="base.html"/> initConfigCheckbox("#usenewzbin"); initConfigCheckbox("#usenzbsorg"); initConfigCheckbox("#usewaffles"); + initConfigCheckbox("#userutracker"); initConfigCheckbox("#useblackhole"); initConfigCheckbox("#useapi"); } diff --git a/data/interfaces/default/history.html b/data/interfaces/default/history.html index 162dab2d..616459cc 100644 --- a/data/interfaces/default/history.html +++ b/data/interfaces/default/history.html @@ -45,6 +45,8 @@ fileid = 'nzb' if item['URL'].find('torrent') != -1: fileid = 'torrent' + if item['URL'].find('rutracker') != -1: + fileid = 'torrent' %> ${item['DateAdded']} diff --git a/headphones/__init__.py b/headphones/__init__.py index 0e06ba4c..b22ad120 100644 --- a/headphones/__init__.py +++ b/headphones/__init__.py @@ -154,6 +154,9 @@ MININOVA = None WAFFLES = None WAFFLES_UID = None WAFFLES_PASSKEY = None +RUTRACKER = None +RUTRACKER_USER = None +RUTRACKER_PASSWORD = None DOWNLOAD_TORRENT_DIR = None INTERFACE = None @@ -248,7 +251,7 @@ def initialize(): LOSSLESS_DESTINATION_DIR, PREFERRED_QUALITY, PREFERRED_BITRATE, DETECT_BITRATE, ADD_ARTISTS, CORRECT_METADATA, MOVE_FILES, \ RENAME_FILES, FOLDER_FORMAT, FILE_FORMAT, CLEANUP_FILES, INCLUDE_EXTRAS, EXTRAS, AUTOWANT_UPCOMING, AUTOWANT_ALL, \ ADD_ALBUM_ART, EMBED_ALBUM_ART, EMBED_LYRICS, DOWNLOAD_DIR, BLACKHOLE, BLACKHOLE_DIR, USENET_RETENTION, SEARCH_INTERVAL, \ - TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, DOWNLOAD_TORRENT_DIR, \ + TORRENTBLACKHOLE_DIR, NUMBEROFSEEDERS, ISOHUNT, KAT, MININOVA, WAFFLES, WAFFLES_UID, WAFFLES_PASSKEY, RUTRACKER, RUTRACKER_USER, RUTRACKER_PASSWORD, DOWNLOAD_TORRENT_DIR, \ LIBRARYSCAN_INTERVAL, DOWNLOAD_SCAN_INTERVAL, SAB_HOST, SAB_USERNAME, SAB_PASSWORD, SAB_APIKEY, SAB_CATEGORY, \ NZBMATRIX, NZBMATRIX_USERNAME, NZBMATRIX_APIKEY, NEWZNAB, NEWZNAB_HOST, NEWZNAB_APIKEY, NEWZNAB_ENABLED, EXTRA_NEWZNABS,\ NZBSORG, NZBSORG_UID, NZBSORG_HASH, NEWZBIN, 
NEWZBIN_UID, NEWZBIN_PASSWORD, LASTFM_USERNAME, INTERFACE, FOLDER_PERMISSIONS, \ @@ -269,6 +272,7 @@ def initialize(): CheckSection('NZBsorg') CheckSection('Newzbin') CheckSection('Waffles') + CheckSection('Rutracker') CheckSection('Prowl') CheckSection('XBMC') CheckSection('NMA') @@ -342,6 +346,10 @@ def initialize(): WAFFLES = bool(check_setting_int(CFG, 'Waffles', 'waffles', 0)) WAFFLES_UID = check_setting_str(CFG, 'Waffles', 'waffles_uid', '') WAFFLES_PASSKEY = check_setting_str(CFG, 'Waffles', 'waffles_passkey', '') + + RUTRACKER = bool(check_setting_int(CFG, 'Rutracker', 'rutracker', 0)) + RUTRACKER_USER = check_setting_str(CFG, 'Rutracker', 'rutracker_user', '') + RUTRACKER_PASSWORD = check_setting_str(CFG, 'Rutracker', 'rutracker_password', '') SAB_HOST = check_setting_str(CFG, 'SABnzbd', 'sab_host', '') SAB_USERNAME = check_setting_str(CFG, 'SABnzbd', 'sab_username', '') @@ -620,6 +628,11 @@ def config_write(): new_config['Waffles']['waffles'] = int(WAFFLES) new_config['Waffles']['waffles_uid'] = WAFFLES_UID new_config['Waffles']['waffles_passkey'] = WAFFLES_PASSKEY + + new_config['Rutracker'] = {} + new_config['Rutracker']['rutracker'] = int(RUTRACKER) + new_config['Rutracker']['rutracker_user'] = RUTRACKER_USER + new_config['Rutracker']['rutracker_password'] = RUTRACKER_PASSWORD new_config['General']['search_interval'] = SEARCH_INTERVAL new_config['General']['libraryscan_interval'] = LIBRARYSCAN_INTERVAL diff --git a/headphones/searcher.py b/headphones/searcher.py index 7470c790..5c48b816 100644 --- a/headphones/searcher.py +++ b/headphones/searcher.py @@ -28,6 +28,9 @@ from headphones import logger, db, helpers, classes, sab import lib.bencode as bencode +import headphones.searcher_rutracker as rutrackersearch +rutracker = rutrackersearch.Rutracker() + class NewzbinDownloader(urllib.FancyURLopener): def __init__(self): @@ -97,7 +100,7 @@ def searchforalbum(albumid=None, new=False, lossless=False): else: foundNZB = searchNZB(result['AlbumID'], new) - if 
(headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none": + if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none": if result['Status'] == "Wanted Lossless": searchTorrent(result['AlbumID'], new, losslessOnly=True) else: @@ -109,7 +112,7 @@ def searchforalbum(albumid=None, new=False, lossless=False): if (headphones.NZBMATRIX or headphones.NEWZNAB or headphones.NZBSORG or headphones.NEWZBIN) and (headphones.SAB_HOST or headphones.BLACKHOLE): foundNZB = searchNZB(albumid, new, lossless) - if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES) and foundNZB == "none": + if (headphones.KAT or headphones.ISOHUNT or headphones.MININOVA or headphones.WAFFLES or headphones.RUTRACKER) and foundNZB == "none": searchTorrent(albumid, new, lossless) def searchNZB(albumid=None, new=False, losslessOnly=False): @@ -632,6 +635,13 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False): results = myDB.select('SELECT ArtistName, AlbumTitle, AlbumID, ReleaseDate from albums WHERE Status="Wanted" OR Status="Wanted Lossless"') new = True + # rutracker login + + if headphones.RUTRACKER and results: + rulogin = rutracker.login(headphones.RUTRACKER_USER, headphones.RUTRACKER_PASSWORD) + if not rulogin: + logger.info(u'Could not login to rutracker, search results will exclude this provider') + for albums in results: albumid = albums[2] @@ -806,7 +816,54 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False): except Exception, e: logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s" % e) - + + # rutracker.org + + if headphones.RUTRACKER and rulogin: + + provider = "rutracker.org" + + # Ignore if release date not specified, results too unpredictable + + if not year: + logger.info(u'Release date not specified, ignoring for rutracker.org') + else: + + bitrate = False + + if 
headphones.PREFERRED_QUALITY == 3 or losslessOnly:
+                format = 'lossless'
+                maxsize = 10000000000
+            elif headphones.PREFERRED_QUALITY == 1:
+                format = 'lossless+mp3'
+                maxsize = 10000000000
+            else:
+                format = 'mp3'
+                maxsize = 300000000
+                if headphones.PREFERRED_QUALITY == 2 and headphones.PREFERRED_BITRATE:
+                    bitrate = True
+
+            # build search url based on above
+
+            searchURL = rutracker.searchurl(artistterm, albumterm, year, format)
+            logger.info(u'Parsing results from rutracker.org: %s' % searchURL)
+
+            # parse results and get best match
+
+            rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid, bitrate)
+
+            # add best match to overall results list
+
+            if rulist:
+                for ru in rulist:
+                    title = ru[0].decode('utf-8')
+                    size = ru[1]
+                    url = ru[2]
+                    resultlist.append((title, size, url, provider))
+                    logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
+            else:
+                logger.info(u"No valid results found from %s" % (provider))
+
     if headphones.ISOHUNT:
         provider = "isoHunt"
@@ -1029,19 +1086,24 @@ def searchTorrent(albumid=None, new=False, losslessOnly=False):
 
             # Get torrent name from .torrent, this is usually used by the torrent client as the folder name
-
             torrent_name = torrent_folder_name + '.torrent'
             download_path = os.path.join(headphones.TORRENTBLACKHOLE_DIR, torrent_name)
 
             try:
-                #Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name.
-                torrent_file = open(download_path, 'wb')
-                torrent_file.write(data)
-                torrent_file.close()
-
-                #Open the fresh torrent file again so we can extract the proper torrent name
-                #Used later in post-processing.
-                torrent_file = open(download_path, 'rb')
+                if bestqual[3] == 'rutracker.org':
+                    download_path = rutracker.get_torrent(bestqual[2], headphones.TORRENTBLACKHOLE_DIR)
+                    if not download_path:
+                        break
+                else:
+                    #Write the torrent file to a path derived from the TORRENTBLACKHOLE_DIR and file name.
+ torrent_file = open(download_path, 'wb') + torrent_file.write(data) + torrent_file.close() + + #Open the fresh torrent file again so we can extract the proper torrent name + #Used later in post-processing. + torrent_file = open(download_path, 'rb') torrent_info = bencode.bdecode(torrent_file.read()) - torrent_file.close() + torrent_file.close() torrent_folder_name = torrent_info['info'].get('name','').decode('utf-8') logger.info('Torrent folder name: %s' % torrent_folder_name) except Exception, e: @@ -1058,7 +1120,12 @@ def preprocesstorrent(resultlist): selresult = result elif int(selresult[1]) < int(result[1]): # if size is lower than new result replace previous selected result (bigger size = better quality?) selresult = result - + + # get outta here if rutracker + + if selresult[3] == 'rutracker.org': + return True, selresult + try: request = urllib2.Request(selresult[2]) request.add_header('Accept-encoding', 'gzip') diff --git a/headphones/searcher_rutracker.py b/headphones/searcher_rutracker.py new file mode 100644 index 00000000..8bde0d1a --- /dev/null +++ b/headphones/searcher_rutracker.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Headphones rutracker.org search +# Functions called from searcher.py + +import urllib +import urllib2 +import cookielib +from urlparse import urlparse +from bs4 import BeautifulSoup +from headphones import logger, db +import lib.bencode as bencode +import os + +class Rutracker(): + + logged_in = False + # Stores a number of login attempts to prevent recursion. + #login_counter = 0 + + def __init__(self): + + self.cookiejar = cookielib.CookieJar() + self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar)) + urllib2.install_opener(self.opener) + + def login(self, login, password): + """Implements tracker login procedure.""" + + self.logged_in = False + + if login is None or password is None: + return False + + #self.login_counter += 1 + + # No recursion wanted. 
+ #if self.login_counter > 1: + # return False + + params = urllib.urlencode({"login_username" : login, + "login_password" : password, + "login" : "Вход"}) + + try: + self.opener.open("http://login.rutracker.org/forum/login.php", params) + except : + pass + + # Check if we're logged in + + for cookie in self.cookiejar: + if cookie.name == 'bb_data': + self.logged_in = True + + return self.logged_in + + def searchurl(self, artist, album, year, format): + """ + Return the search url + """ + + # Build search url + + searchterm = '' + if artist != 'Various Artists': + searchterm = artist + searchterm = searchterm + ' ' + searchterm = searchterm + album + searchterm = searchterm + ' ' + searchterm = searchterm + year + + providerurl = "http://rutracker.org/forum/tracker.php" + + if format == 'lossless': + format = '+lossless' + elif format == 'lossless+mp3': + format = '+lossless||mp3||aac' + else: + format = '+mp3||aac' + + # sort by size, descending. + + sort = '&o=7&s=2' + + searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort) + + return searchurl + + def search(self, searchurl, maxsize, minseeders, albumid, bitrate): + """ + Parse the search results and return the first valid torrent + """ + + titles = [] + urls = [] + seeders = [] + sizes = [] + torrentlist = [] + rulist = [] + + try: + + page = self.opener.open(searchurl, timeout=60) + soup = BeautifulSoup(page.read()) + + # Debug + #logger.debug (soup.prettify()) + + # Title + + for link in soup.find_all('a', attrs={'class' : 'med tLink bold'}): + title = link.get_text() + titles.append(title) + + # Download URL + + for link in soup.find_all('a', attrs={'class' : 'small tr-dl dl-stub'}): + url = link.get('href') + urls.append(url) + + # Seeders + + for link in soup.find_all('td', attrs={'class' : 'row4 seedmed'}): + seeder = link.get_text() + seeders.append(seeder) + + # Size + + for link in soup.find_all('td', attrs={'class' : 'row4 small nowrap tor-size'}): + size = link.u.string + 
sizes.append(size) + + except : + pass + + # Combine lists + + torrentlist = zip(titles, urls, seeders, sizes) + + # return if nothing found + + if not torrentlist: + return False + + # get headphones track count for album, return if not found + + hptrackcount = 0 + + myDB = db.DBConnection() + tracks = myDB.select('SELECT TrackTitle from tracks WHERE AlbumID=?', [albumid]) + for track in tracks: + hptrackcount += 1 + + if not hptrackcount: + logger.info('headphones track info not found, cannot compare to torrent') + return False + + # Return the first valid torrent, unless we want a preferred bitrate then we want all valid entries + + for torrent in torrentlist: + + returntitle = torrent[0].encode('utf-8') + url = torrent[1] + seeders = torrent[2] + size = torrent[3] + + # Attempt to filter out unwanted + + title = returntitle.lower() + + if 'promo' not in title and 'vinyl' not in title and 'songbook' not in title and 'tvrip' not in title and 'hdtv' not in title and 'dvd' not in title \ + and int(size) <= maxsize and int(seeders) >= minseeders: + + # Check torrent info + + torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] + self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) + + # Debug + #for cookie in self.cookiejar: + # logger.debug ('Cookie: %s' % cookie) + + try: + page = self.opener.open(url) + torrent = page.read() + if torrent: + decoded = bencode.bdecode(torrent) + metainfo = decoded['info'] + page.close () + except Exception, e: + logger.error('Error getting torrent: %s' % e) + return False + + # get torrent track count and check for cue + + trackcount = 0 + cuecount = 0 + + if 'files' in metainfo: # multi + for pathfile in 
metainfo['files']: + path = pathfile['path'] + for file in path: + if '.ape' in file or '.flac' in file or '.ogg' in file or '.m4a' in file or '.aac' in file or '.mp3' in file or '.wav' in file or '.aif' in file: + trackcount += 1 + if '.cue' in file: + cuecount += 1 + + #Torrent topic page + + topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id + logger.debug ('torrent title: %s' % title) + logger.debug ('headphones trackcount: %s' % hptrackcount) + logger.debug ('rutracker trackcount: %s' % trackcount) + + # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s) + # This is for the case where we have a single .flac/.wav which can be split by cue + # Not great, but shouldn't be doing this too often + + totallogcount = 0 + if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount: + page = self.opener.open(topicurl, timeout=60) + soup = BeautifulSoup(page.read()) + findtoc = soup.find_all(text='TOC of the extracted CD') + if not findtoc: + findtoc = soup.find_all(text='TOC извлечённого CD') + for toc in findtoc: + logcount = 0 + for toccontent in toc.find_all_next(text=True): + cut_string = toccontent.split('|') + new_string = cut_string[0].lstrip().rstrip() + if new_string == '1' or new_string == '01': + logcount = 1 + elif logcount > 0: + if new_string.isdigit(): + logcount += 1 + else: + break + totallogcount = totallogcount + logcount + + if totallogcount > 0: + trackcount = totallogcount + logger.debug ('rutracker logtrackcount: %s' % totallogcount) + + # If torrent track count = hp track count then return torrent, + # if greater, check for deluxe/special/foreign editions + # if less, then allow if it's a single track with a cue + + valid = False + + if trackcount == hptrackcount: + valid = True + elif trackcount > hptrackcount: + if 'deluxe' in title or 'edition' in title or 'japanese' in title: + valid = True + + # return 1st valid torrent if not checking by 
bitrate, else add to list and return at end + + if valid: + rulist.append((returntitle, size, topicurl)) + if not bitrate: + return rulist + + return rulist + + + def get_torrent(self, url, savelocation): + + torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] + self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) + downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id + torrent_name = torrent_id + '.torrent' + download_path = os.path.join(savelocation, torrent_name) + + try: + page = self.opener.open(downloadurl) + torrent = page.read() + fp = open (download_path, 'wb') + fp.write (torrent) + fp.close () + except Exception, e: + logger.error('Error getting torrent: %s' % e) + return False + + return download_path + diff --git a/headphones/versioncheck.py b/headphones/versioncheck.py index 429ec8b9..e1440fd8 100644 --- a/headphones/versioncheck.py +++ b/headphones/versioncheck.py @@ -20,7 +20,7 @@ from headphones import logger, version import lib.simplejson as simplejson -user = "rembo10" +user = "AdeHub" branch = "master" def runGit(args): diff --git a/headphones/webserve.py b/headphones/webserve.py index 799d84f3..d985dc2b 100644 --- a/headphones/webserve.py +++ b/headphones/webserve.py @@ -463,6 +463,9 @@ class WebInterface(object): "use_waffles" : checked(headphones.WAFFLES), "waffles_uid" : headphones.WAFFLES_UID, "waffles_passkey": headphones.WAFFLES_PASSKEY, + "use_rutracker" : checked(headphones.RUTRACKER), + "rutracker_user" : headphones.RUTRACKER_USER, + "rutracker_password": headphones.RUTRACKER_PASSWORD, "pref_qual_0" : radio(headphones.PREFERRED_QUALITY, 0), "pref_qual_1" : radio(headphones.PREFERRED_QUALITY, 1), "pref_qual_3" 
: radio(headphones.PREFERRED_QUALITY, 3), @@ -545,7 +548,7 @@ class WebInterface(object): sab_category=None, download_dir=None, blackhole=0, blackhole_dir=None, usenet_retention=None, nzbmatrix=0, nzbmatrix_username=None, nzbmatrix_apikey=None, newznab=0, newznab_host=None, newznab_apikey=None, newznab_enabled=0, nzbsorg=0, nzbsorg_uid=None, nzbsorg_hash=None, newzbin=0, newzbin_uid=None, newzbin_password=None, preferred_quality=0, preferred_bitrate=None, detect_bitrate=0, move_files=0, torrentblackhole_dir=None, download_torrent_dir=None, - numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rename_files=0, correct_metadata=0, + numberofseeders=10, use_isohunt=0, use_kat=0, use_mininova=0, waffles=0, waffles_uid=None, waffles_passkey=None, rutracker=0, rutracker_user=None, rutracker_password=None, rename_files=0, correct_metadata=0, cleanup_files=0, add_album_art=0, embed_album_art=0, embed_lyrics=0, destination_dir=None, lossless_destination_dir=None, folder_format=None, file_format=None, include_extras=0, single=0, ep=0, compilation=0, soundtrack=0, live=0, remix=0, spokenword=0, audiobook=0, autowant_upcoming=False, autowant_all=False, interface=None, log_dir=None, music_encoder=0, encoder=None, bitrate=None, samplingfrequency=None, encoderfolder=None, advancedencoder=None, @@ -595,6 +598,9 @@ class WebInterface(object): headphones.WAFFLES = waffles headphones.WAFFLES_UID = waffles_uid headphones.WAFFLES_PASSKEY = waffles_passkey + headphones.RUTRACKER = rutracker + headphones.RUTRACKER_USER = rutracker_user + headphones.RUTRACKER_PASSWORD = rutracker_password headphones.PREFERRED_QUALITY = int(preferred_quality) headphones.PREFERRED_BITRATE = preferred_bitrate headphones.PREFERRED_BITRATE_HIGH_BUFFER = preferred_bitrate_high_buffer diff --git a/html5lib/__init__.py b/html5lib/__init__.py new file mode 100644 index 00000000..16537aad --- /dev/null +++ b/html5lib/__init__.py @@ -0,0 +1,17 @@ +""" +HTML 
parsing library based on the WHATWG "HTML5" +specification. The parser is designed to be compatible with existing +HTML found in the wild and implements well-defined error recovery that +is largely compatible with modern desktop web browsers. + +Example usage: + +import html5lib +f = open("my_document.html") +tree = html5lib.parse(f) +""" +__version__ = "0.95-dev" +from html5parser import HTMLParser, parse, parseFragment +from treebuilders import getTreeBuilder +from treewalkers import getTreeWalker +from serializer import serialize diff --git a/html5lib/constants.py b/html5lib/constants.py new file mode 100644 index 00000000..b533018e --- /dev/null +++ b/html5lib/constants.py @@ -0,0 +1,3085 @@ +import string, gettext +_ = gettext.gettext + +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset + +EOF = None + +E = { + "null-character": + _(u"Null character in input stream, replaced with U+FFFD."), + "invalid-codepoint": + _(u"Invalid codepoint in stream."), + "incorrectly-placed-solidus": + _(u"Solidus (/) incorrectly placed in tag."), + "incorrect-cr-newline-entity": + _(u"Incorrect CR newline entity, replaced with LF."), + "illegal-windows-1252-entity": + _(u"Entity used with illegal number (windows-1252 reference)."), + "cant-convert-numeric-entity": + _(u"Numeric entity couldn't be converted to character " + u"(codepoint U+%(charAsInt)08x)."), + "illegal-codepoint-for-numeric-entity": + _(u"Numeric entity represents an illegal codepoint: " + u"U+%(charAsInt)08x."), + "numeric-entity-without-semicolon": + _(u"Numeric entity didn't end with ';'."), + "expected-numeric-entity-but-got-eof": + _(u"Numeric entity expected. Got end of file instead."), + "expected-numeric-entity": + _(u"Numeric entity expected but none found."), + "named-entity-without-semicolon": + _(u"Named entity didn't end with ';'."), + "expected-named-entity": + _(u"Named entity expected. 
Got none."), + "attributes-in-end-tag": + _(u"End tag contains unexpected attributes."), + 'self-closing-flag-on-end-tag': + _(u"End tag contains unexpected self-closing flag."), + "expected-tag-name-but-got-right-bracket": + _(u"Expected tag name. Got '>' instead."), + "expected-tag-name-but-got-question-mark": + _(u"Expected tag name. Got '?' instead. (HTML doesn't " + u"support processing instructions.)"), + "expected-tag-name": + _(u"Expected tag name. Got something else instead"), + "expected-closing-tag-but-got-right-bracket": + _(u"Expected closing tag. Got '>' instead. Ignoring ''."), + "expected-closing-tag-but-got-eof": + _(u"Expected closing tag. Unexpected end of file."), + "expected-closing-tag-but-got-char": + _(u"Expected closing tag. Unexpected character '%(data)s' found."), + "eof-in-tag-name": + _(u"Unexpected end of file in the tag name."), + "expected-attribute-name-but-got-eof": + _(u"Unexpected end of file. Expected attribute name instead."), + "eof-in-attribute-name": + _(u"Unexpected end of file in attribute name."), + "invalid-character-in-attribute-name": + _(u"Invalid chracter in attribute name"), + "duplicate-attribute": + _(u"Dropped duplicate attribute on tag."), + "expected-end-of-tag-name-but-got-eof": + _(u"Unexpected end of file. Expected = or end of tag."), + "expected-attribute-value-but-got-eof": + _(u"Unexpected end of file. Expected attribute value."), + "expected-attribute-value-but-got-right-bracket": + _(u"Expected attribute value. 
Got '>' instead."), + 'equals-in-unquoted-attribute-value': + _(u"Unexpected = in unquoted attribute"), + 'unexpected-character-in-unquoted-attribute-value': + _(u"Unexpected character in unquoted attribute"), + "invalid-character-after-attribute-name": + _(u"Unexpected character after attribute name."), + "unexpected-character-after-attribute-value": + _(u"Unexpected character after attribute value."), + "eof-in-attribute-value-double-quote": + _(u"Unexpected end of file in attribute value (\")."), + "eof-in-attribute-value-single-quote": + _(u"Unexpected end of file in attribute value (')."), + "eof-in-attribute-value-no-quotes": + _(u"Unexpected end of file in attribute value."), + "unexpected-EOF-after-solidus-in-tag": + _(u"Unexpected end of file in tag. Expected >"), + "unexpected-character-after-soldius-in-tag": + _(u"Unexpected character after / in tag. Expected >"), + "expected-dashes-or-doctype": + _(u"Expected '--' or 'DOCTYPE'. Not found."), + "unexpected-bang-after-double-dash-in-comment": + _(u"Unexpected ! after -- in comment"), + "unexpected-space-after-double-dash-in-comment": + _(u"Unexpected space after -- in comment"), + "incorrect-comment": + _(u"Incorrect comment."), + "eof-in-comment": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-dash": + _(u"Unexpected end of file in comment (-)"), + "unexpected-dash-after-double-dash-in-comment": + _(u"Unexpected '-' after '--' found in comment."), + "eof-in-comment-double-dash": + _(u"Unexpected end of file in comment (--)."), + "eof-in-comment-end-space-state": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-bang-state": + _(u"Unexpected end of file in comment."), + "unexpected-char-in-comment": + _(u"Unexpected character in comment found."), + "need-space-after-doctype": + _(u"No space after literal string 'DOCTYPE'."), + "expected-doctype-name-but-got-right-bracket": + _(u"Unexpected > character. 
Expected DOCTYPE name."), + "expected-doctype-name-but-got-eof": + _(u"Unexpected end of file. Expected DOCTYPE name."), + "eof-in-doctype-name": + _(u"Unexpected end of file in DOCTYPE name."), + "eof-in-doctype": + _(u"Unexpected end of file in DOCTYPE."), + "expected-space-or-right-bracket-in-doctype": + _(u"Expected space or '>'. Got '%(data)s'"), + "unexpected-end-of-doctype": + _(u"Unexpected end of DOCTYPE."), + "unexpected-char-in-doctype": + _(u"Unexpected character in DOCTYPE."), + "eof-in-innerhtml": + _(u"XXX innerHTML EOF"), + "unexpected-doctype": + _(u"Unexpected DOCTYPE. Ignored."), + "non-html-root": + _(u"html needs to be the first start tag."), + "expected-doctype-but-got-eof": + _(u"Unexpected End of file. Expected DOCTYPE."), + "unknown-doctype": + _(u"Erroneous DOCTYPE."), + "expected-doctype-but-got-chars": + _(u"Unexpected non-space characters. Expected DOCTYPE."), + "expected-doctype-but-got-start-tag": + _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."), + "expected-doctype-but-got-end-tag": + _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."), + "end-tag-after-implied-root": + _(u"Unexpected end tag (%(name)s) after the (implied) root element."), + "expected-named-closing-tag-but-got-eof": + _(u"Unexpected end of file. Expected end tag (%(name)s)."), + "two-heads-are-not-better-than-one": + _(u"Unexpected start tag head in existing head. Ignored."), + "unexpected-end-tag": + _(u"Unexpected end tag (%(name)s). Ignored."), + "unexpected-start-tag-out-of-my-head": + _(u"Unexpected start tag (%(name)s) that can be in head. Moved."), + "unexpected-start-tag": + _(u"Unexpected start tag (%(name)s)."), + "missing-end-tag": + _(u"Missing end tag (%(name)s)."), + "missing-end-tags": + _(u"Missing end tags (%(name)s)."), + "unexpected-start-tag-implies-end-tag": + _(u"Unexpected start tag (%(startName)s) " + u"implies end tag (%(endName)s)."), + "unexpected-start-tag-treated-as": + _(u"Unexpected start tag (%(originalName)s). 
Treated as %(newName)s."), + "deprecated-tag": + _(u"Unexpected start tag %(name)s. Don't use it!"), + "unexpected-start-tag-ignored": + _(u"Unexpected start tag %(name)s. Ignored."), + "expected-one-end-tag-but-got-another": + _(u"Unexpected end tag (%(gotName)s). " + u"Missing end tag (%(expectedName)s)."), + "end-tag-too-early": + _(u"End tag (%(name)s) seen too early. Expected other end tag."), + "end-tag-too-early-named": + _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "end-tag-too-early-ignored": + _(u"End tag (%(name)s) seen too early. Ignored."), + "adoption-agency-1.1": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 1 of the adoption agency algorithm."), + "adoption-agency-1.2": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 2 of the adoption agency algorithm."), + "adoption-agency-1.3": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 3 of the adoption agency algorithm."), + "unexpected-end-tag-treated-as": + _(u"Unexpected end tag (%(originalName)s). 
Treated as %(newName)s."), + "no-end-tag": + _(u"This element (%(name)s) has no end tag."), + "unexpected-implied-end-tag-in-table": + _(u"Unexpected implied end tag (%(name)s) in the table phase."), + "unexpected-implied-end-tag-in-table-body": + _(u"Unexpected implied end tag (%(name)s) in the table body phase."), + "unexpected-char-implies-table-voodoo": + _(u"Unexpected non-space characters in " + u"table context caused voodoo mode."), + "unexpected-hidden-input-in-table": + _(u"Unexpected input with type hidden in table context."), + "unexpected-form-in-table": + _(u"Unexpected form in table context."), + "unexpected-start-tag-implies-table-voodoo": + _(u"Unexpected start tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-end-tag-implies-table-voodoo": + _(u"Unexpected end tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-cell-in-table-body": + _(u"Unexpected table cell start tag (%(name)s) " + u"in the table body phase."), + "unexpected-cell-end-tag": + _(u"Got table cell end tag (%(name)s) " + u"while required end tags are missing."), + "unexpected-end-tag-in-table-body": + _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "unexpected-implied-end-tag-in-table-row": + _(u"Unexpected implied end tag (%(name)s) in the table row phase."), + "unexpected-end-tag-in-table-row": + _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "unexpected-select-in-select": + _(u"Unexpected select start tag in the select phase " + u"treated as select end tag."), + "unexpected-input-in-select": + _(u"Unexpected input start tag in the select phase."), + "unexpected-start-tag-in-select": + _(u"Unexpected start tag token (%(name)s in the select phase. " + u"Ignored."), + "unexpected-end-tag-in-select": + _(u"Unexpected end tag (%(name)s) in the select phase. 
Ignored."), + "unexpected-table-element-start-tag-in-select-in-table": + _(u"Unexpected table element start tag (%(name)s) in the select in table phase."), + "unexpected-table-element-end-tag-in-select-in-table": + _(u"Unexpected table element end tag (%(name)s) in the select in table phase."), + "unexpected-char-after-body": + _(u"Unexpected non-space characters in the after body phase."), + "unexpected-start-tag-after-body": + _(u"Unexpected start tag token (%(name)s)" + u" in the after body phase."), + "unexpected-end-tag-after-body": + _(u"Unexpected end tag token (%(name)s)" + u" in the after body phase."), + "unexpected-char-in-frameset": + _(u"Unepxected characters in the frameset phase. Characters ignored."), + "unexpected-start-tag-in-frameset": + _(u"Unexpected start tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-frameset-in-frameset-innerhtml": + _(u"Unexpected end tag token (frameset) " + u"in the frameset phase (innerHTML)."), + "unexpected-end-tag-in-frameset": + _(u"Unexpected end tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-char-after-frameset": + _(u"Unexpected non-space characters in the " + u"after frameset phase. Ignored."), + "unexpected-start-tag-after-frameset": + _(u"Unexpected start tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-frameset": + _(u"Unexpected end tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-body-innerhtml": + _(u"Unexpected end tag after body(innerHtml)"), + "expected-eof-but-got-char": + _(u"Unexpected non-space characters. Expected end of file."), + "expected-eof-but-got-start-tag": + _(u"Unexpected start tag (%(name)s)" + u". Expected end of file."), + "expected-eof-but-got-end-tag": + _(u"Unexpected end tag (%(name)s)" + u". Expected end of file."), + "eof-in-table": + _(u"Unexpected end of file. Expected table content."), + "eof-in-select": + _(u"Unexpected end of file. 
Expected select content."), + "eof-in-frameset": + _(u"Unexpected end of file. Expected frameset content."), + "eof-in-script-in-script": + _(u"Unexpected end of file. Expected script content."), + "eof-in-foreign-lands": + _(u"Unexpected end of file. Expected foreign content"), + "non-void-element-with-trailing-solidus": + _(u"Trailing solidus not allowed on element %(name)s"), + "unexpected-html-element-in-foreign-content": + _(u"Element %(name)s not allowed in a non-html context"), + "unexpected-end-tag-before-html": + _(u"Unexpected end tag (%(name)s) before html."), + "XXX-undefined-error": + (u"Undefined error (this sucks and should be fixed)"), +} + +namespaces = { + "html":"http://www.w3.org/1999/xhtml", + "mathml":"http://www.w3.org/1998/Math/MathML", + "svg":"http://www.w3.org/2000/svg", + "xlink":"http://www.w3.org/1999/xlink", + "xml":"http://www.w3.org/XML/1998/namespace", + "xmlns":"http://www.w3.org/2000/xmlns/" +} + +scopingElements = frozenset(( + (namespaces["html"], "applet"), + (namespaces["html"], "caption"), + (namespaces["html"], "html"), + (namespaces["html"], "marquee"), + (namespaces["html"], "object"), + (namespaces["html"], "table"), + (namespaces["html"], "td"), + (namespaces["html"], "th"), + (namespaces["mathml"], "mi"), + (namespaces["mathml"], "mo"), + (namespaces["mathml"], "mn"), + (namespaces["mathml"], "ms"), + (namespaces["mathml"], "mtext"), + (namespaces["mathml"], "annotation-xml"), + (namespaces["svg"], "foreignObject"), + (namespaces["svg"], "desc"), + (namespaces["svg"], "title"), +)) + +formattingElements = frozenset(( + (namespaces["html"], "a"), + (namespaces["html"], "b"), + (namespaces["html"], "big"), + (namespaces["html"], "code"), + (namespaces["html"], "em"), + (namespaces["html"], "font"), + (namespaces["html"], "i"), + (namespaces["html"], "nobr"), + (namespaces["html"], "s"), + (namespaces["html"], "small"), + (namespaces["html"], "strike"), + (namespaces["html"], "strong"), + (namespaces["html"], "tt"), + 
(namespaces["html"], "u") +)) + +specialElements = frozenset(( + (namespaces["html"], "address"), + (namespaces["html"], "applet"), + (namespaces["html"], "area"), + (namespaces["html"], "article"), + (namespaces["html"], "aside"), + (namespaces["html"], "base"), + (namespaces["html"], "basefont"), + (namespaces["html"], "bgsound"), + (namespaces["html"], "blockquote"), + (namespaces["html"], "body"), + (namespaces["html"], "br"), + (namespaces["html"], "button"), + (namespaces["html"], "caption"), + (namespaces["html"], "center"), + (namespaces["html"], "col"), + (namespaces["html"], "colgroup"), + (namespaces["html"], "command"), + (namespaces["html"], "dd"), + (namespaces["html"], "details"), + (namespaces["html"], "dir"), + (namespaces["html"], "div"), + (namespaces["html"], "dl"), + (namespaces["html"], "dt"), + (namespaces["html"], "embed"), + (namespaces["html"], "fieldset"), + (namespaces["html"], "figure"), + (namespaces["html"], "footer"), + (namespaces["html"], "form"), + (namespaces["html"], "frame"), + (namespaces["html"], "frameset"), + (namespaces["html"], "h1"), + (namespaces["html"], "h2"), + (namespaces["html"], "h3"), + (namespaces["html"], "h4"), + (namespaces["html"], "h5"), + (namespaces["html"], "h6"), + (namespaces["html"], "head"), + (namespaces["html"], "header"), + (namespaces["html"], "hr"), + (namespaces["html"], "html"), + (namespaces["html"], "iframe"), + # Note that image is commented out in the spec as "this isn't an + # element that can end up on the stack, so it doesn't matter," + (namespaces["html"], "image"), + (namespaces["html"], "img"), + (namespaces["html"], "input"), + (namespaces["html"], "isindex"), + (namespaces["html"], "li"), + (namespaces["html"], "link"), + (namespaces["html"], "listing"), + (namespaces["html"], "marquee"), + (namespaces["html"], "menu"), + (namespaces["html"], "meta"), + (namespaces["html"], "nav"), + (namespaces["html"], "noembed"), + (namespaces["html"], "noframes"), + (namespaces["html"], 
"noscript"), + (namespaces["html"], "object"), + (namespaces["html"], "ol"), + (namespaces["html"], "p"), + (namespaces["html"], "param"), + (namespaces["html"], "plaintext"), + (namespaces["html"], "pre"), + (namespaces["html"], "script"), + (namespaces["html"], "section"), + (namespaces["html"], "select"), + (namespaces["html"], "style"), + (namespaces["html"], "table"), + (namespaces["html"], "tbody"), + (namespaces["html"], "td"), + (namespaces["html"], "textarea"), + (namespaces["html"], "tfoot"), + (namespaces["html"], "th"), + (namespaces["html"], "thead"), + (namespaces["html"], "title"), + (namespaces["html"], "tr"), + (namespaces["html"], "ul"), + (namespaces["html"], "wbr"), + (namespaces["html"], "xmp"), + (namespaces["svg"], "foreignObject") +)) + +htmlIntegrationPointElements = frozenset(( + (namespaces["mathml"], "annotaion-xml"), + (namespaces["svg"], "foreignObject"), + (namespaces["svg"], "desc"), + (namespaces["svg"], "title") +)) + +mathmlTextIntegrationPointElements = frozenset(( + (namespaces["mathml"], "mi"), + (namespaces["mathml"], "mo"), + (namespaces["mathml"], "mn"), + (namespaces["mathml"], "ms"), + (namespaces["mathml"], "mtext") +)) + +spaceCharacters = frozenset(( + u"\t", + u"\n", + u"\u000C", + u" ", + u"\r" +)) + +tableInsertModeElements = frozenset(( + "table", + "tbody", + "tfoot", + "thead", + "tr" +)) + +asciiLowercase = frozenset(string.ascii_lowercase) +asciiUppercase = frozenset(string.ascii_uppercase) +asciiLetters = frozenset(string.ascii_letters) +digits = frozenset(string.digits) +hexDigits = frozenset(string.hexdigits) + +asciiUpper2Lower = dict([(ord(c),ord(c.lower())) + for c in string.ascii_uppercase]) + +# Heading elements need to be ordered +headingElements = ( + "h1", + "h2", + "h3", + "h4", + "h5", + "h6" +) + +voidElements = frozenset(( + "base", + "command", + "event-source", + "link", + "meta", + "hr", + "br", + "img", + "embed", + "param", + "area", + "col", + "input", + "source", + "track" +)) + 
cdataElements = frozenset(('title', 'textarea'))

# Elements whose content is parsed as raw text (RCDATA/RAWTEXT-style).
rcdataElements = frozenset((
    'style',
    'script',
    'xmp',
    'iframe',
    'noembed',
    'noframes',
    'noscript'
))

# Per-tag sets of attributes that may be serialized in minimized
# (boolean) form.  The "" key holds attributes valid on any element.
booleanAttributes = {
    "": frozenset(("irrelevant",)),
    "style": frozenset(("scoped",)),
    "img": frozenset(("ismap",)),
    "audio": frozenset(("autoplay","controls")),
    "video": frozenset(("autoplay","controls")),
    "script": frozenset(("defer", "async")),
    "details": frozenset(("open",)),
    "datagrid": frozenset(("multiple", "disabled")),
    "command": frozenset(("hidden", "disabled", "checked", "default")),
    # BUG FIX: frozenset(("noshade")) passed a plain parenthesized
    # string, producing the set of its characters
    # {'n','o','s','h','a','d','e'}.  The trailing comma makes it a
    # one-element tuple so the set is {"noshade"} as intended.
    "hr": frozenset(("noshade",)),
    "menu": frozenset(("autosubmit",)),
    "fieldset": frozenset(("disabled", "readonly")),
    "option": frozenset(("disabled", "readonly", "selected")),
    "optgroup": frozenset(("disabled", "readonly")),
    "button": frozenset(("disabled", "autofocus")),
    "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
    "output": frozenset(("disabled", "readonly")),
}

# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
+entitiesWindows1252 = ( + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS +) + +xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) + +entities = { + "AElig": u"\xc6", + "AElig;": u"\xc6", + "AMP": u"&", + "AMP;": u"&", + "Aacute": u"\xc1", + "Aacute;": u"\xc1", + "Abreve;": u"\u0102", + "Acirc": u"\xc2", + "Acirc;": u"\xc2", + "Acy;": u"\u0410", + "Afr;": u"\U0001d504", + "Agrave": u"\xc0", + "Agrave;": u"\xc0", + "Alpha;": u"\u0391", + "Amacr;": u"\u0100", + "And;": u"\u2a53", + "Aogon;": u"\u0104", + "Aopf;": u"\U0001d538", + "ApplyFunction;": u"\u2061", + "Aring": u"\xc5", + "Aring;": u"\xc5", + "Ascr;": u"\U0001d49c", + "Assign;": u"\u2254", + "Atilde": 
u"\xc3", + "Atilde;": u"\xc3", + "Auml": u"\xc4", + "Auml;": u"\xc4", + "Backslash;": u"\u2216", + "Barv;": u"\u2ae7", + "Barwed;": u"\u2306", + "Bcy;": u"\u0411", + "Because;": u"\u2235", + "Bernoullis;": u"\u212c", + "Beta;": u"\u0392", + "Bfr;": u"\U0001d505", + "Bopf;": u"\U0001d539", + "Breve;": u"\u02d8", + "Bscr;": u"\u212c", + "Bumpeq;": u"\u224e", + "CHcy;": u"\u0427", + "COPY": u"\xa9", + "COPY;": u"\xa9", + "Cacute;": u"\u0106", + "Cap;": u"\u22d2", + "CapitalDifferentialD;": u"\u2145", + "Cayleys;": u"\u212d", + "Ccaron;": u"\u010c", + "Ccedil": u"\xc7", + "Ccedil;": u"\xc7", + "Ccirc;": u"\u0108", + "Cconint;": u"\u2230", + "Cdot;": u"\u010a", + "Cedilla;": u"\xb8", + "CenterDot;": u"\xb7", + "Cfr;": u"\u212d", + "Chi;": u"\u03a7", + "CircleDot;": u"\u2299", + "CircleMinus;": u"\u2296", + "CirclePlus;": u"\u2295", + "CircleTimes;": u"\u2297", + "ClockwiseContourIntegral;": u"\u2232", + "CloseCurlyDoubleQuote;": u"\u201d", + "CloseCurlyQuote;": u"\u2019", + "Colon;": u"\u2237", + "Colone;": u"\u2a74", + "Congruent;": u"\u2261", + "Conint;": u"\u222f", + "ContourIntegral;": u"\u222e", + "Copf;": u"\u2102", + "Coproduct;": u"\u2210", + "CounterClockwiseContourIntegral;": u"\u2233", + "Cross;": u"\u2a2f", + "Cscr;": u"\U0001d49e", + "Cup;": u"\u22d3", + "CupCap;": u"\u224d", + "DD;": u"\u2145", + "DDotrahd;": u"\u2911", + "DJcy;": u"\u0402", + "DScy;": u"\u0405", + "DZcy;": u"\u040f", + "Dagger;": u"\u2021", + "Darr;": u"\u21a1", + "Dashv;": u"\u2ae4", + "Dcaron;": u"\u010e", + "Dcy;": u"\u0414", + "Del;": u"\u2207", + "Delta;": u"\u0394", + "Dfr;": u"\U0001d507", + "DiacriticalAcute;": u"\xb4", + "DiacriticalDot;": u"\u02d9", + "DiacriticalDoubleAcute;": u"\u02dd", + "DiacriticalGrave;": u"`", + "DiacriticalTilde;": u"\u02dc", + "Diamond;": u"\u22c4", + "DifferentialD;": u"\u2146", + "Dopf;": u"\U0001d53b", + "Dot;": u"\xa8", + "DotDot;": u"\u20dc", + "DotEqual;": u"\u2250", + "DoubleContourIntegral;": u"\u222f", + "DoubleDot;": u"\xa8", + 
"DoubleDownArrow;": u"\u21d3", + "DoubleLeftArrow;": u"\u21d0", + "DoubleLeftRightArrow;": u"\u21d4", + "DoubleLeftTee;": u"\u2ae4", + "DoubleLongLeftArrow;": u"\u27f8", + "DoubleLongLeftRightArrow;": u"\u27fa", + "DoubleLongRightArrow;": u"\u27f9", + "DoubleRightArrow;": u"\u21d2", + "DoubleRightTee;": u"\u22a8", + "DoubleUpArrow;": u"\u21d1", + "DoubleUpDownArrow;": u"\u21d5", + "DoubleVerticalBar;": u"\u2225", + "DownArrow;": u"\u2193", + "DownArrowBar;": u"\u2913", + "DownArrowUpArrow;": u"\u21f5", + "DownBreve;": u"\u0311", + "DownLeftRightVector;": u"\u2950", + "DownLeftTeeVector;": u"\u295e", + "DownLeftVector;": u"\u21bd", + "DownLeftVectorBar;": u"\u2956", + "DownRightTeeVector;": u"\u295f", + "DownRightVector;": u"\u21c1", + "DownRightVectorBar;": u"\u2957", + "DownTee;": u"\u22a4", + "DownTeeArrow;": u"\u21a7", + "Downarrow;": u"\u21d3", + "Dscr;": u"\U0001d49f", + "Dstrok;": u"\u0110", + "ENG;": u"\u014a", + "ETH": u"\xd0", + "ETH;": u"\xd0", + "Eacute": u"\xc9", + "Eacute;": u"\xc9", + "Ecaron;": u"\u011a", + "Ecirc": u"\xca", + "Ecirc;": u"\xca", + "Ecy;": u"\u042d", + "Edot;": u"\u0116", + "Efr;": u"\U0001d508", + "Egrave": u"\xc8", + "Egrave;": u"\xc8", + "Element;": u"\u2208", + "Emacr;": u"\u0112", + "EmptySmallSquare;": u"\u25fb", + "EmptyVerySmallSquare;": u"\u25ab", + "Eogon;": u"\u0118", + "Eopf;": u"\U0001d53c", + "Epsilon;": u"\u0395", + "Equal;": u"\u2a75", + "EqualTilde;": u"\u2242", + "Equilibrium;": u"\u21cc", + "Escr;": u"\u2130", + "Esim;": u"\u2a73", + "Eta;": u"\u0397", + "Euml": u"\xcb", + "Euml;": u"\xcb", + "Exists;": u"\u2203", + "ExponentialE;": u"\u2147", + "Fcy;": u"\u0424", + "Ffr;": u"\U0001d509", + "FilledSmallSquare;": u"\u25fc", + "FilledVerySmallSquare;": u"\u25aa", + "Fopf;": u"\U0001d53d", + "ForAll;": u"\u2200", + "Fouriertrf;": u"\u2131", + "Fscr;": u"\u2131", + "GJcy;": u"\u0403", + "GT": u">", + "GT;": u">", + "Gamma;": u"\u0393", + "Gammad;": u"\u03dc", + "Gbreve;": u"\u011e", + "Gcedil;": u"\u0122", + "Gcirc;": 
u"\u011c", + "Gcy;": u"\u0413", + "Gdot;": u"\u0120", + "Gfr;": u"\U0001d50a", + "Gg;": u"\u22d9", + "Gopf;": u"\U0001d53e", + "GreaterEqual;": u"\u2265", + "GreaterEqualLess;": u"\u22db", + "GreaterFullEqual;": u"\u2267", + "GreaterGreater;": u"\u2aa2", + "GreaterLess;": u"\u2277", + "GreaterSlantEqual;": u"\u2a7e", + "GreaterTilde;": u"\u2273", + "Gscr;": u"\U0001d4a2", + "Gt;": u"\u226b", + "HARDcy;": u"\u042a", + "Hacek;": u"\u02c7", + "Hat;": u"^", + "Hcirc;": u"\u0124", + "Hfr;": u"\u210c", + "HilbertSpace;": u"\u210b", + "Hopf;": u"\u210d", + "HorizontalLine;": u"\u2500", + "Hscr;": u"\u210b", + "Hstrok;": u"\u0126", + "HumpDownHump;": u"\u224e", + "HumpEqual;": u"\u224f", + "IEcy;": u"\u0415", + "IJlig;": u"\u0132", + "IOcy;": u"\u0401", + "Iacute": u"\xcd", + "Iacute;": u"\xcd", + "Icirc": u"\xce", + "Icirc;": u"\xce", + "Icy;": u"\u0418", + "Idot;": u"\u0130", + "Ifr;": u"\u2111", + "Igrave": u"\xcc", + "Igrave;": u"\xcc", + "Im;": u"\u2111", + "Imacr;": u"\u012a", + "ImaginaryI;": u"\u2148", + "Implies;": u"\u21d2", + "Int;": u"\u222c", + "Integral;": u"\u222b", + "Intersection;": u"\u22c2", + "InvisibleComma;": u"\u2063", + "InvisibleTimes;": u"\u2062", + "Iogon;": u"\u012e", + "Iopf;": u"\U0001d540", + "Iota;": u"\u0399", + "Iscr;": u"\u2110", + "Itilde;": u"\u0128", + "Iukcy;": u"\u0406", + "Iuml": u"\xcf", + "Iuml;": u"\xcf", + "Jcirc;": u"\u0134", + "Jcy;": u"\u0419", + "Jfr;": u"\U0001d50d", + "Jopf;": u"\U0001d541", + "Jscr;": u"\U0001d4a5", + "Jsercy;": u"\u0408", + "Jukcy;": u"\u0404", + "KHcy;": u"\u0425", + "KJcy;": u"\u040c", + "Kappa;": u"\u039a", + "Kcedil;": u"\u0136", + "Kcy;": u"\u041a", + "Kfr;": u"\U0001d50e", + "Kopf;": u"\U0001d542", + "Kscr;": u"\U0001d4a6", + "LJcy;": u"\u0409", + "LT": u"<", + "LT;": u"<", + "Lacute;": u"\u0139", + "Lambda;": u"\u039b", + "Lang;": u"\u27ea", + "Laplacetrf;": u"\u2112", + "Larr;": u"\u219e", + "Lcaron;": u"\u013d", + "Lcedil;": u"\u013b", + "Lcy;": u"\u041b", + "LeftAngleBracket;": u"\u27e8", + 
"LeftArrow;": u"\u2190", + "LeftArrowBar;": u"\u21e4", + "LeftArrowRightArrow;": u"\u21c6", + "LeftCeiling;": u"\u2308", + "LeftDoubleBracket;": u"\u27e6", + "LeftDownTeeVector;": u"\u2961", + "LeftDownVector;": u"\u21c3", + "LeftDownVectorBar;": u"\u2959", + "LeftFloor;": u"\u230a", + "LeftRightArrow;": u"\u2194", + "LeftRightVector;": u"\u294e", + "LeftTee;": u"\u22a3", + "LeftTeeArrow;": u"\u21a4", + "LeftTeeVector;": u"\u295a", + "LeftTriangle;": u"\u22b2", + "LeftTriangleBar;": u"\u29cf", + "LeftTriangleEqual;": u"\u22b4", + "LeftUpDownVector;": u"\u2951", + "LeftUpTeeVector;": u"\u2960", + "LeftUpVector;": u"\u21bf", + "LeftUpVectorBar;": u"\u2958", + "LeftVector;": u"\u21bc", + "LeftVectorBar;": u"\u2952", + "Leftarrow;": u"\u21d0", + "Leftrightarrow;": u"\u21d4", + "LessEqualGreater;": u"\u22da", + "LessFullEqual;": u"\u2266", + "LessGreater;": u"\u2276", + "LessLess;": u"\u2aa1", + "LessSlantEqual;": u"\u2a7d", + "LessTilde;": u"\u2272", + "Lfr;": u"\U0001d50f", + "Ll;": u"\u22d8", + "Lleftarrow;": u"\u21da", + "Lmidot;": u"\u013f", + "LongLeftArrow;": u"\u27f5", + "LongLeftRightArrow;": u"\u27f7", + "LongRightArrow;": u"\u27f6", + "Longleftarrow;": u"\u27f8", + "Longleftrightarrow;": u"\u27fa", + "Longrightarrow;": u"\u27f9", + "Lopf;": u"\U0001d543", + "LowerLeftArrow;": u"\u2199", + "LowerRightArrow;": u"\u2198", + "Lscr;": u"\u2112", + "Lsh;": u"\u21b0", + "Lstrok;": u"\u0141", + "Lt;": u"\u226a", + "Map;": u"\u2905", + "Mcy;": u"\u041c", + "MediumSpace;": u"\u205f", + "Mellintrf;": u"\u2133", + "Mfr;": u"\U0001d510", + "MinusPlus;": u"\u2213", + "Mopf;": u"\U0001d544", + "Mscr;": u"\u2133", + "Mu;": u"\u039c", + "NJcy;": u"\u040a", + "Nacute;": u"\u0143", + "Ncaron;": u"\u0147", + "Ncedil;": u"\u0145", + "Ncy;": u"\u041d", + "NegativeMediumSpace;": u"\u200b", + "NegativeThickSpace;": u"\u200b", + "NegativeThinSpace;": u"\u200b", + "NegativeVeryThinSpace;": u"\u200b", + "NestedGreaterGreater;": u"\u226b", + "NestedLessLess;": u"\u226a", + "NewLine;": 
u"\n", + "Nfr;": u"\U0001d511", + "NoBreak;": u"\u2060", + "NonBreakingSpace;": u"\xa0", + "Nopf;": u"\u2115", + "Not;": u"\u2aec", + "NotCongruent;": u"\u2262", + "NotCupCap;": u"\u226d", + "NotDoubleVerticalBar;": u"\u2226", + "NotElement;": u"\u2209", + "NotEqual;": u"\u2260", + "NotEqualTilde;": u"\u2242\u0338", + "NotExists;": u"\u2204", + "NotGreater;": u"\u226f", + "NotGreaterEqual;": u"\u2271", + "NotGreaterFullEqual;": u"\u2267\u0338", + "NotGreaterGreater;": u"\u226b\u0338", + "NotGreaterLess;": u"\u2279", + "NotGreaterSlantEqual;": u"\u2a7e\u0338", + "NotGreaterTilde;": u"\u2275", + "NotHumpDownHump;": u"\u224e\u0338", + "NotHumpEqual;": u"\u224f\u0338", + "NotLeftTriangle;": u"\u22ea", + "NotLeftTriangleBar;": u"\u29cf\u0338", + "NotLeftTriangleEqual;": u"\u22ec", + "NotLess;": u"\u226e", + "NotLessEqual;": u"\u2270", + "NotLessGreater;": u"\u2278", + "NotLessLess;": u"\u226a\u0338", + "NotLessSlantEqual;": u"\u2a7d\u0338", + "NotLessTilde;": u"\u2274", + "NotNestedGreaterGreater;": u"\u2aa2\u0338", + "NotNestedLessLess;": u"\u2aa1\u0338", + "NotPrecedes;": u"\u2280", + "NotPrecedesEqual;": u"\u2aaf\u0338", + "NotPrecedesSlantEqual;": u"\u22e0", + "NotReverseElement;": u"\u220c", + "NotRightTriangle;": u"\u22eb", + "NotRightTriangleBar;": u"\u29d0\u0338", + "NotRightTriangleEqual;": u"\u22ed", + "NotSquareSubset;": u"\u228f\u0338", + "NotSquareSubsetEqual;": u"\u22e2", + "NotSquareSuperset;": u"\u2290\u0338", + "NotSquareSupersetEqual;": u"\u22e3", + "NotSubset;": u"\u2282\u20d2", + "NotSubsetEqual;": u"\u2288", + "NotSucceeds;": u"\u2281", + "NotSucceedsEqual;": u"\u2ab0\u0338", + "NotSucceedsSlantEqual;": u"\u22e1", + "NotSucceedsTilde;": u"\u227f\u0338", + "NotSuperset;": u"\u2283\u20d2", + "NotSupersetEqual;": u"\u2289", + "NotTilde;": u"\u2241", + "NotTildeEqual;": u"\u2244", + "NotTildeFullEqual;": u"\u2247", + "NotTildeTilde;": u"\u2249", + "NotVerticalBar;": u"\u2224", + "Nscr;": u"\U0001d4a9", + "Ntilde": u"\xd1", + "Ntilde;": u"\xd1", + "Nu;": 
u"\u039d", + "OElig;": u"\u0152", + "Oacute": u"\xd3", + "Oacute;": u"\xd3", + "Ocirc": u"\xd4", + "Ocirc;": u"\xd4", + "Ocy;": u"\u041e", + "Odblac;": u"\u0150", + "Ofr;": u"\U0001d512", + "Ograve": u"\xd2", + "Ograve;": u"\xd2", + "Omacr;": u"\u014c", + "Omega;": u"\u03a9", + "Omicron;": u"\u039f", + "Oopf;": u"\U0001d546", + "OpenCurlyDoubleQuote;": u"\u201c", + "OpenCurlyQuote;": u"\u2018", + "Or;": u"\u2a54", + "Oscr;": u"\U0001d4aa", + "Oslash": u"\xd8", + "Oslash;": u"\xd8", + "Otilde": u"\xd5", + "Otilde;": u"\xd5", + "Otimes;": u"\u2a37", + "Ouml": u"\xd6", + "Ouml;": u"\xd6", + "OverBar;": u"\u203e", + "OverBrace;": u"\u23de", + "OverBracket;": u"\u23b4", + "OverParenthesis;": u"\u23dc", + "PartialD;": u"\u2202", + "Pcy;": u"\u041f", + "Pfr;": u"\U0001d513", + "Phi;": u"\u03a6", + "Pi;": u"\u03a0", + "PlusMinus;": u"\xb1", + "Poincareplane;": u"\u210c", + "Popf;": u"\u2119", + "Pr;": u"\u2abb", + "Precedes;": u"\u227a", + "PrecedesEqual;": u"\u2aaf", + "PrecedesSlantEqual;": u"\u227c", + "PrecedesTilde;": u"\u227e", + "Prime;": u"\u2033", + "Product;": u"\u220f", + "Proportion;": u"\u2237", + "Proportional;": u"\u221d", + "Pscr;": u"\U0001d4ab", + "Psi;": u"\u03a8", + "QUOT": u"\"", + "QUOT;": u"\"", + "Qfr;": u"\U0001d514", + "Qopf;": u"\u211a", + "Qscr;": u"\U0001d4ac", + "RBarr;": u"\u2910", + "REG": u"\xae", + "REG;": u"\xae", + "Racute;": u"\u0154", + "Rang;": u"\u27eb", + "Rarr;": u"\u21a0", + "Rarrtl;": u"\u2916", + "Rcaron;": u"\u0158", + "Rcedil;": u"\u0156", + "Rcy;": u"\u0420", + "Re;": u"\u211c", + "ReverseElement;": u"\u220b", + "ReverseEquilibrium;": u"\u21cb", + "ReverseUpEquilibrium;": u"\u296f", + "Rfr;": u"\u211c", + "Rho;": u"\u03a1", + "RightAngleBracket;": u"\u27e9", + "RightArrow;": u"\u2192", + "RightArrowBar;": u"\u21e5", + "RightArrowLeftArrow;": u"\u21c4", + "RightCeiling;": u"\u2309", + "RightDoubleBracket;": u"\u27e7", + "RightDownTeeVector;": u"\u295d", + "RightDownVector;": u"\u21c2", + "RightDownVectorBar;": u"\u2955", + 
"RightFloor;": u"\u230b", + "RightTee;": u"\u22a2", + "RightTeeArrow;": u"\u21a6", + "RightTeeVector;": u"\u295b", + "RightTriangle;": u"\u22b3", + "RightTriangleBar;": u"\u29d0", + "RightTriangleEqual;": u"\u22b5", + "RightUpDownVector;": u"\u294f", + "RightUpTeeVector;": u"\u295c", + "RightUpVector;": u"\u21be", + "RightUpVectorBar;": u"\u2954", + "RightVector;": u"\u21c0", + "RightVectorBar;": u"\u2953", + "Rightarrow;": u"\u21d2", + "Ropf;": u"\u211d", + "RoundImplies;": u"\u2970", + "Rrightarrow;": u"\u21db", + "Rscr;": u"\u211b", + "Rsh;": u"\u21b1", + "RuleDelayed;": u"\u29f4", + "SHCHcy;": u"\u0429", + "SHcy;": u"\u0428", + "SOFTcy;": u"\u042c", + "Sacute;": u"\u015a", + "Sc;": u"\u2abc", + "Scaron;": u"\u0160", + "Scedil;": u"\u015e", + "Scirc;": u"\u015c", + "Scy;": u"\u0421", + "Sfr;": u"\U0001d516", + "ShortDownArrow;": u"\u2193", + "ShortLeftArrow;": u"\u2190", + "ShortRightArrow;": u"\u2192", + "ShortUpArrow;": u"\u2191", + "Sigma;": u"\u03a3", + "SmallCircle;": u"\u2218", + "Sopf;": u"\U0001d54a", + "Sqrt;": u"\u221a", + "Square;": u"\u25a1", + "SquareIntersection;": u"\u2293", + "SquareSubset;": u"\u228f", + "SquareSubsetEqual;": u"\u2291", + "SquareSuperset;": u"\u2290", + "SquareSupersetEqual;": u"\u2292", + "SquareUnion;": u"\u2294", + "Sscr;": u"\U0001d4ae", + "Star;": u"\u22c6", + "Sub;": u"\u22d0", + "Subset;": u"\u22d0", + "SubsetEqual;": u"\u2286", + "Succeeds;": u"\u227b", + "SucceedsEqual;": u"\u2ab0", + "SucceedsSlantEqual;": u"\u227d", + "SucceedsTilde;": u"\u227f", + "SuchThat;": u"\u220b", + "Sum;": u"\u2211", + "Sup;": u"\u22d1", + "Superset;": u"\u2283", + "SupersetEqual;": u"\u2287", + "Supset;": u"\u22d1", + "THORN": u"\xde", + "THORN;": u"\xde", + "TRADE;": u"\u2122", + "TSHcy;": u"\u040b", + "TScy;": u"\u0426", + "Tab;": u"\t", + "Tau;": u"\u03a4", + "Tcaron;": u"\u0164", + "Tcedil;": u"\u0162", + "Tcy;": u"\u0422", + "Tfr;": u"\U0001d517", + "Therefore;": u"\u2234", + "Theta;": u"\u0398", + "ThickSpace;": u"\u205f\u200a", + 
"ThinSpace;": u"\u2009", + "Tilde;": u"\u223c", + "TildeEqual;": u"\u2243", + "TildeFullEqual;": u"\u2245", + "TildeTilde;": u"\u2248", + "Topf;": u"\U0001d54b", + "TripleDot;": u"\u20db", + "Tscr;": u"\U0001d4af", + "Tstrok;": u"\u0166", + "Uacute": u"\xda", + "Uacute;": u"\xda", + "Uarr;": u"\u219f", + "Uarrocir;": u"\u2949", + "Ubrcy;": u"\u040e", + "Ubreve;": u"\u016c", + "Ucirc": u"\xdb", + "Ucirc;": u"\xdb", + "Ucy;": u"\u0423", + "Udblac;": u"\u0170", + "Ufr;": u"\U0001d518", + "Ugrave": u"\xd9", + "Ugrave;": u"\xd9", + "Umacr;": u"\u016a", + "UnderBar;": u"_", + "UnderBrace;": u"\u23df", + "UnderBracket;": u"\u23b5", + "UnderParenthesis;": u"\u23dd", + "Union;": u"\u22c3", + "UnionPlus;": u"\u228e", + "Uogon;": u"\u0172", + "Uopf;": u"\U0001d54c", + "UpArrow;": u"\u2191", + "UpArrowBar;": u"\u2912", + "UpArrowDownArrow;": u"\u21c5", + "UpDownArrow;": u"\u2195", + "UpEquilibrium;": u"\u296e", + "UpTee;": u"\u22a5", + "UpTeeArrow;": u"\u21a5", + "Uparrow;": u"\u21d1", + "Updownarrow;": u"\u21d5", + "UpperLeftArrow;": u"\u2196", + "UpperRightArrow;": u"\u2197", + "Upsi;": u"\u03d2", + "Upsilon;": u"\u03a5", + "Uring;": u"\u016e", + "Uscr;": u"\U0001d4b0", + "Utilde;": u"\u0168", + "Uuml": u"\xdc", + "Uuml;": u"\xdc", + "VDash;": u"\u22ab", + "Vbar;": u"\u2aeb", + "Vcy;": u"\u0412", + "Vdash;": u"\u22a9", + "Vdashl;": u"\u2ae6", + "Vee;": u"\u22c1", + "Verbar;": u"\u2016", + "Vert;": u"\u2016", + "VerticalBar;": u"\u2223", + "VerticalLine;": u"|", + "VerticalSeparator;": u"\u2758", + "VerticalTilde;": u"\u2240", + "VeryThinSpace;": u"\u200a", + "Vfr;": u"\U0001d519", + "Vopf;": u"\U0001d54d", + "Vscr;": u"\U0001d4b1", + "Vvdash;": u"\u22aa", + "Wcirc;": u"\u0174", + "Wedge;": u"\u22c0", + "Wfr;": u"\U0001d51a", + "Wopf;": u"\U0001d54e", + "Wscr;": u"\U0001d4b2", + "Xfr;": u"\U0001d51b", + "Xi;": u"\u039e", + "Xopf;": u"\U0001d54f", + "Xscr;": u"\U0001d4b3", + "YAcy;": u"\u042f", + "YIcy;": u"\u0407", + "YUcy;": u"\u042e", + "Yacute": u"\xdd", + "Yacute;": 
u"\xdd", + "Ycirc;": u"\u0176", + "Ycy;": u"\u042b", + "Yfr;": u"\U0001d51c", + "Yopf;": u"\U0001d550", + "Yscr;": u"\U0001d4b4", + "Yuml;": u"\u0178", + "ZHcy;": u"\u0416", + "Zacute;": u"\u0179", + "Zcaron;": u"\u017d", + "Zcy;": u"\u0417", + "Zdot;": u"\u017b", + "ZeroWidthSpace;": u"\u200b", + "Zeta;": u"\u0396", + "Zfr;": u"\u2128", + "Zopf;": u"\u2124", + "Zscr;": u"\U0001d4b5", + "aacute": u"\xe1", + "aacute;": u"\xe1", + "abreve;": u"\u0103", + "ac;": u"\u223e", + "acE;": u"\u223e\u0333", + "acd;": u"\u223f", + "acirc": u"\xe2", + "acirc;": u"\xe2", + "acute": u"\xb4", + "acute;": u"\xb4", + "acy;": u"\u0430", + "aelig": u"\xe6", + "aelig;": u"\xe6", + "af;": u"\u2061", + "afr;": u"\U0001d51e", + "agrave": u"\xe0", + "agrave;": u"\xe0", + "alefsym;": u"\u2135", + "aleph;": u"\u2135", + "alpha;": u"\u03b1", + "amacr;": u"\u0101", + "amalg;": u"\u2a3f", + "amp": u"&", + "amp;": u"&", + "and;": u"\u2227", + "andand;": u"\u2a55", + "andd;": u"\u2a5c", + "andslope;": u"\u2a58", + "andv;": u"\u2a5a", + "ang;": u"\u2220", + "ange;": u"\u29a4", + "angle;": u"\u2220", + "angmsd;": u"\u2221", + "angmsdaa;": u"\u29a8", + "angmsdab;": u"\u29a9", + "angmsdac;": u"\u29aa", + "angmsdad;": u"\u29ab", + "angmsdae;": u"\u29ac", + "angmsdaf;": u"\u29ad", + "angmsdag;": u"\u29ae", + "angmsdah;": u"\u29af", + "angrt;": u"\u221f", + "angrtvb;": u"\u22be", + "angrtvbd;": u"\u299d", + "angsph;": u"\u2222", + "angst;": u"\xc5", + "angzarr;": u"\u237c", + "aogon;": u"\u0105", + "aopf;": u"\U0001d552", + "ap;": u"\u2248", + "apE;": u"\u2a70", + "apacir;": u"\u2a6f", + "ape;": u"\u224a", + "apid;": u"\u224b", + "apos;": u"'", + "approx;": u"\u2248", + "approxeq;": u"\u224a", + "aring": u"\xe5", + "aring;": u"\xe5", + "ascr;": u"\U0001d4b6", + "ast;": u"*", + "asymp;": u"\u2248", + "asympeq;": u"\u224d", + "atilde": u"\xe3", + "atilde;": u"\xe3", + "auml": u"\xe4", + "auml;": u"\xe4", + "awconint;": u"\u2233", + "awint;": u"\u2a11", + "bNot;": u"\u2aed", + "backcong;": u"\u224c", + 
"backepsilon;": u"\u03f6", + "backprime;": u"\u2035", + "backsim;": u"\u223d", + "backsimeq;": u"\u22cd", + "barvee;": u"\u22bd", + "barwed;": u"\u2305", + "barwedge;": u"\u2305", + "bbrk;": u"\u23b5", + "bbrktbrk;": u"\u23b6", + "bcong;": u"\u224c", + "bcy;": u"\u0431", + "bdquo;": u"\u201e", + "becaus;": u"\u2235", + "because;": u"\u2235", + "bemptyv;": u"\u29b0", + "bepsi;": u"\u03f6", + "bernou;": u"\u212c", + "beta;": u"\u03b2", + "beth;": u"\u2136", + "between;": u"\u226c", + "bfr;": u"\U0001d51f", + "bigcap;": u"\u22c2", + "bigcirc;": u"\u25ef", + "bigcup;": u"\u22c3", + "bigodot;": u"\u2a00", + "bigoplus;": u"\u2a01", + "bigotimes;": u"\u2a02", + "bigsqcup;": u"\u2a06", + "bigstar;": u"\u2605", + "bigtriangledown;": u"\u25bd", + "bigtriangleup;": u"\u25b3", + "biguplus;": u"\u2a04", + "bigvee;": u"\u22c1", + "bigwedge;": u"\u22c0", + "bkarow;": u"\u290d", + "blacklozenge;": u"\u29eb", + "blacksquare;": u"\u25aa", + "blacktriangle;": u"\u25b4", + "blacktriangledown;": u"\u25be", + "blacktriangleleft;": u"\u25c2", + "blacktriangleright;": u"\u25b8", + "blank;": u"\u2423", + "blk12;": u"\u2592", + "blk14;": u"\u2591", + "blk34;": u"\u2593", + "block;": u"\u2588", + "bne;": u"=\u20e5", + "bnequiv;": u"\u2261\u20e5", + "bnot;": u"\u2310", + "bopf;": u"\U0001d553", + "bot;": u"\u22a5", + "bottom;": u"\u22a5", + "bowtie;": u"\u22c8", + "boxDL;": u"\u2557", + "boxDR;": u"\u2554", + "boxDl;": u"\u2556", + "boxDr;": u"\u2553", + "boxH;": u"\u2550", + "boxHD;": u"\u2566", + "boxHU;": u"\u2569", + "boxHd;": u"\u2564", + "boxHu;": u"\u2567", + "boxUL;": u"\u255d", + "boxUR;": u"\u255a", + "boxUl;": u"\u255c", + "boxUr;": u"\u2559", + "boxV;": u"\u2551", + "boxVH;": u"\u256c", + "boxVL;": u"\u2563", + "boxVR;": u"\u2560", + "boxVh;": u"\u256b", + "boxVl;": u"\u2562", + "boxVr;": u"\u255f", + "boxbox;": u"\u29c9", + "boxdL;": u"\u2555", + "boxdR;": u"\u2552", + "boxdl;": u"\u2510", + "boxdr;": u"\u250c", + "boxh;": u"\u2500", + "boxhD;": u"\u2565", + "boxhU;": u"\u2568", 
+ "boxhd;": u"\u252c", + "boxhu;": u"\u2534", + "boxminus;": u"\u229f", + "boxplus;": u"\u229e", + "boxtimes;": u"\u22a0", + "boxuL;": u"\u255b", + "boxuR;": u"\u2558", + "boxul;": u"\u2518", + "boxur;": u"\u2514", + "boxv;": u"\u2502", + "boxvH;": u"\u256a", + "boxvL;": u"\u2561", + "boxvR;": u"\u255e", + "boxvh;": u"\u253c", + "boxvl;": u"\u2524", + "boxvr;": u"\u251c", + "bprime;": u"\u2035", + "breve;": u"\u02d8", + "brvbar": u"\xa6", + "brvbar;": u"\xa6", + "bscr;": u"\U0001d4b7", + "bsemi;": u"\u204f", + "bsim;": u"\u223d", + "bsime;": u"\u22cd", + "bsol;": u"\\", + "bsolb;": u"\u29c5", + "bsolhsub;": u"\u27c8", + "bull;": u"\u2022", + "bullet;": u"\u2022", + "bump;": u"\u224e", + "bumpE;": u"\u2aae", + "bumpe;": u"\u224f", + "bumpeq;": u"\u224f", + "cacute;": u"\u0107", + "cap;": u"\u2229", + "capand;": u"\u2a44", + "capbrcup;": u"\u2a49", + "capcap;": u"\u2a4b", + "capcup;": u"\u2a47", + "capdot;": u"\u2a40", + "caps;": u"\u2229\ufe00", + "caret;": u"\u2041", + "caron;": u"\u02c7", + "ccaps;": u"\u2a4d", + "ccaron;": u"\u010d", + "ccedil": u"\xe7", + "ccedil;": u"\xe7", + "ccirc;": u"\u0109", + "ccups;": u"\u2a4c", + "ccupssm;": u"\u2a50", + "cdot;": u"\u010b", + "cedil": u"\xb8", + "cedil;": u"\xb8", + "cemptyv;": u"\u29b2", + "cent": u"\xa2", + "cent;": u"\xa2", + "centerdot;": u"\xb7", + "cfr;": u"\U0001d520", + "chcy;": u"\u0447", + "check;": u"\u2713", + "checkmark;": u"\u2713", + "chi;": u"\u03c7", + "cir;": u"\u25cb", + "cirE;": u"\u29c3", + "circ;": u"\u02c6", + "circeq;": u"\u2257", + "circlearrowleft;": u"\u21ba", + "circlearrowright;": u"\u21bb", + "circledR;": u"\xae", + "circledS;": u"\u24c8", + "circledast;": u"\u229b", + "circledcirc;": u"\u229a", + "circleddash;": u"\u229d", + "cire;": u"\u2257", + "cirfnint;": u"\u2a10", + "cirmid;": u"\u2aef", + "cirscir;": u"\u29c2", + "clubs;": u"\u2663", + "clubsuit;": u"\u2663", + "colon;": u":", + "colone;": u"\u2254", + "coloneq;": u"\u2254", + "comma;": u",", + "commat;": u"@", + "comp;": u"\u2201", 
+ "compfn;": u"\u2218", + "complement;": u"\u2201", + "complexes;": u"\u2102", + "cong;": u"\u2245", + "congdot;": u"\u2a6d", + "conint;": u"\u222e", + "copf;": u"\U0001d554", + "coprod;": u"\u2210", + "copy": u"\xa9", + "copy;": u"\xa9", + "copysr;": u"\u2117", + "crarr;": u"\u21b5", + "cross;": u"\u2717", + "cscr;": u"\U0001d4b8", + "csub;": u"\u2acf", + "csube;": u"\u2ad1", + "csup;": u"\u2ad0", + "csupe;": u"\u2ad2", + "ctdot;": u"\u22ef", + "cudarrl;": u"\u2938", + "cudarrr;": u"\u2935", + "cuepr;": u"\u22de", + "cuesc;": u"\u22df", + "cularr;": u"\u21b6", + "cularrp;": u"\u293d", + "cup;": u"\u222a", + "cupbrcap;": u"\u2a48", + "cupcap;": u"\u2a46", + "cupcup;": u"\u2a4a", + "cupdot;": u"\u228d", + "cupor;": u"\u2a45", + "cups;": u"\u222a\ufe00", + "curarr;": u"\u21b7", + "curarrm;": u"\u293c", + "curlyeqprec;": u"\u22de", + "curlyeqsucc;": u"\u22df", + "curlyvee;": u"\u22ce", + "curlywedge;": u"\u22cf", + "curren": u"\xa4", + "curren;": u"\xa4", + "curvearrowleft;": u"\u21b6", + "curvearrowright;": u"\u21b7", + "cuvee;": u"\u22ce", + "cuwed;": u"\u22cf", + "cwconint;": u"\u2232", + "cwint;": u"\u2231", + "cylcty;": u"\u232d", + "dArr;": u"\u21d3", + "dHar;": u"\u2965", + "dagger;": u"\u2020", + "daleth;": u"\u2138", + "darr;": u"\u2193", + "dash;": u"\u2010", + "dashv;": u"\u22a3", + "dbkarow;": u"\u290f", + "dblac;": u"\u02dd", + "dcaron;": u"\u010f", + "dcy;": u"\u0434", + "dd;": u"\u2146", + "ddagger;": u"\u2021", + "ddarr;": u"\u21ca", + "ddotseq;": u"\u2a77", + "deg": u"\xb0", + "deg;": u"\xb0", + "delta;": u"\u03b4", + "demptyv;": u"\u29b1", + "dfisht;": u"\u297f", + "dfr;": u"\U0001d521", + "dharl;": u"\u21c3", + "dharr;": u"\u21c2", + "diam;": u"\u22c4", + "diamond;": u"\u22c4", + "diamondsuit;": u"\u2666", + "diams;": u"\u2666", + "die;": u"\xa8", + "digamma;": u"\u03dd", + "disin;": u"\u22f2", + "div;": u"\xf7", + "divide": u"\xf7", + "divide;": u"\xf7", + "divideontimes;": u"\u22c7", + "divonx;": u"\u22c7", + "djcy;": u"\u0452", + "dlcorn;": 
u"\u231e", + "dlcrop;": u"\u230d", + "dollar;": u"$", + "dopf;": u"\U0001d555", + "dot;": u"\u02d9", + "doteq;": u"\u2250", + "doteqdot;": u"\u2251", + "dotminus;": u"\u2238", + "dotplus;": u"\u2214", + "dotsquare;": u"\u22a1", + "doublebarwedge;": u"\u2306", + "downarrow;": u"\u2193", + "downdownarrows;": u"\u21ca", + "downharpoonleft;": u"\u21c3", + "downharpoonright;": u"\u21c2", + "drbkarow;": u"\u2910", + "drcorn;": u"\u231f", + "drcrop;": u"\u230c", + "dscr;": u"\U0001d4b9", + "dscy;": u"\u0455", + "dsol;": u"\u29f6", + "dstrok;": u"\u0111", + "dtdot;": u"\u22f1", + "dtri;": u"\u25bf", + "dtrif;": u"\u25be", + "duarr;": u"\u21f5", + "duhar;": u"\u296f", + "dwangle;": u"\u29a6", + "dzcy;": u"\u045f", + "dzigrarr;": u"\u27ff", + "eDDot;": u"\u2a77", + "eDot;": u"\u2251", + "eacute": u"\xe9", + "eacute;": u"\xe9", + "easter;": u"\u2a6e", + "ecaron;": u"\u011b", + "ecir;": u"\u2256", + "ecirc": u"\xea", + "ecirc;": u"\xea", + "ecolon;": u"\u2255", + "ecy;": u"\u044d", + "edot;": u"\u0117", + "ee;": u"\u2147", + "efDot;": u"\u2252", + "efr;": u"\U0001d522", + "eg;": u"\u2a9a", + "egrave": u"\xe8", + "egrave;": u"\xe8", + "egs;": u"\u2a96", + "egsdot;": u"\u2a98", + "el;": u"\u2a99", + "elinters;": u"\u23e7", + "ell;": u"\u2113", + "els;": u"\u2a95", + "elsdot;": u"\u2a97", + "emacr;": u"\u0113", + "empty;": u"\u2205", + "emptyset;": u"\u2205", + "emptyv;": u"\u2205", + "emsp13;": u"\u2004", + "emsp14;": u"\u2005", + "emsp;": u"\u2003", + "eng;": u"\u014b", + "ensp;": u"\u2002", + "eogon;": u"\u0119", + "eopf;": u"\U0001d556", + "epar;": u"\u22d5", + "eparsl;": u"\u29e3", + "eplus;": u"\u2a71", + "epsi;": u"\u03b5", + "epsilon;": u"\u03b5", + "epsiv;": u"\u03f5", + "eqcirc;": u"\u2256", + "eqcolon;": u"\u2255", + "eqsim;": u"\u2242", + "eqslantgtr;": u"\u2a96", + "eqslantless;": u"\u2a95", + "equals;": u"=", + "equest;": u"\u225f", + "equiv;": u"\u2261", + "equivDD;": u"\u2a78", + "eqvparsl;": u"\u29e5", + "erDot;": u"\u2253", + "erarr;": u"\u2971", + "escr;": 
u"\u212f", + "esdot;": u"\u2250", + "esim;": u"\u2242", + "eta;": u"\u03b7", + "eth": u"\xf0", + "eth;": u"\xf0", + "euml": u"\xeb", + "euml;": u"\xeb", + "euro;": u"\u20ac", + "excl;": u"!", + "exist;": u"\u2203", + "expectation;": u"\u2130", + "exponentiale;": u"\u2147", + "fallingdotseq;": u"\u2252", + "fcy;": u"\u0444", + "female;": u"\u2640", + "ffilig;": u"\ufb03", + "fflig;": u"\ufb00", + "ffllig;": u"\ufb04", + "ffr;": u"\U0001d523", + "filig;": u"\ufb01", + "fjlig;": u"fj", + "flat;": u"\u266d", + "fllig;": u"\ufb02", + "fltns;": u"\u25b1", + "fnof;": u"\u0192", + "fopf;": u"\U0001d557", + "forall;": u"\u2200", + "fork;": u"\u22d4", + "forkv;": u"\u2ad9", + "fpartint;": u"\u2a0d", + "frac12": u"\xbd", + "frac12;": u"\xbd", + "frac13;": u"\u2153", + "frac14": u"\xbc", + "frac14;": u"\xbc", + "frac15;": u"\u2155", + "frac16;": u"\u2159", + "frac18;": u"\u215b", + "frac23;": u"\u2154", + "frac25;": u"\u2156", + "frac34": u"\xbe", + "frac34;": u"\xbe", + "frac35;": u"\u2157", + "frac38;": u"\u215c", + "frac45;": u"\u2158", + "frac56;": u"\u215a", + "frac58;": u"\u215d", + "frac78;": u"\u215e", + "frasl;": u"\u2044", + "frown;": u"\u2322", + "fscr;": u"\U0001d4bb", + "gE;": u"\u2267", + "gEl;": u"\u2a8c", + "gacute;": u"\u01f5", + "gamma;": u"\u03b3", + "gammad;": u"\u03dd", + "gap;": u"\u2a86", + "gbreve;": u"\u011f", + "gcirc;": u"\u011d", + "gcy;": u"\u0433", + "gdot;": u"\u0121", + "ge;": u"\u2265", + "gel;": u"\u22db", + "geq;": u"\u2265", + "geqq;": u"\u2267", + "geqslant;": u"\u2a7e", + "ges;": u"\u2a7e", + "gescc;": u"\u2aa9", + "gesdot;": u"\u2a80", + "gesdoto;": u"\u2a82", + "gesdotol;": u"\u2a84", + "gesl;": u"\u22db\ufe00", + "gesles;": u"\u2a94", + "gfr;": u"\U0001d524", + "gg;": u"\u226b", + "ggg;": u"\u22d9", + "gimel;": u"\u2137", + "gjcy;": u"\u0453", + "gl;": u"\u2277", + "glE;": u"\u2a92", + "gla;": u"\u2aa5", + "glj;": u"\u2aa4", + "gnE;": u"\u2269", + "gnap;": u"\u2a8a", + "gnapprox;": u"\u2a8a", + "gne;": u"\u2a88", + "gneq;": u"\u2a88", + 
"gneqq;": u"\u2269", + "gnsim;": u"\u22e7", + "gopf;": u"\U0001d558", + "grave;": u"`", + "gscr;": u"\u210a", + "gsim;": u"\u2273", + "gsime;": u"\u2a8e", + "gsiml;": u"\u2a90", + "gt": u">", + "gt;": u">", + "gtcc;": u"\u2aa7", + "gtcir;": u"\u2a7a", + "gtdot;": u"\u22d7", + "gtlPar;": u"\u2995", + "gtquest;": u"\u2a7c", + "gtrapprox;": u"\u2a86", + "gtrarr;": u"\u2978", + "gtrdot;": u"\u22d7", + "gtreqless;": u"\u22db", + "gtreqqless;": u"\u2a8c", + "gtrless;": u"\u2277", + "gtrsim;": u"\u2273", + "gvertneqq;": u"\u2269\ufe00", + "gvnE;": u"\u2269\ufe00", + "hArr;": u"\u21d4", + "hairsp;": u"\u200a", + "half;": u"\xbd", + "hamilt;": u"\u210b", + "hardcy;": u"\u044a", + "harr;": u"\u2194", + "harrcir;": u"\u2948", + "harrw;": u"\u21ad", + "hbar;": u"\u210f", + "hcirc;": u"\u0125", + "hearts;": u"\u2665", + "heartsuit;": u"\u2665", + "hellip;": u"\u2026", + "hercon;": u"\u22b9", + "hfr;": u"\U0001d525", + "hksearow;": u"\u2925", + "hkswarow;": u"\u2926", + "hoarr;": u"\u21ff", + "homtht;": u"\u223b", + "hookleftarrow;": u"\u21a9", + "hookrightarrow;": u"\u21aa", + "hopf;": u"\U0001d559", + "horbar;": u"\u2015", + "hscr;": u"\U0001d4bd", + "hslash;": u"\u210f", + "hstrok;": u"\u0127", + "hybull;": u"\u2043", + "hyphen;": u"\u2010", + "iacute": u"\xed", + "iacute;": u"\xed", + "ic;": u"\u2063", + "icirc": u"\xee", + "icirc;": u"\xee", + "icy;": u"\u0438", + "iecy;": u"\u0435", + "iexcl": u"\xa1", + "iexcl;": u"\xa1", + "iff;": u"\u21d4", + "ifr;": u"\U0001d526", + "igrave": u"\xec", + "igrave;": u"\xec", + "ii;": u"\u2148", + "iiiint;": u"\u2a0c", + "iiint;": u"\u222d", + "iinfin;": u"\u29dc", + "iiota;": u"\u2129", + "ijlig;": u"\u0133", + "imacr;": u"\u012b", + "image;": u"\u2111", + "imagline;": u"\u2110", + "imagpart;": u"\u2111", + "imath;": u"\u0131", + "imof;": u"\u22b7", + "imped;": u"\u01b5", + "in;": u"\u2208", + "incare;": u"\u2105", + "infin;": u"\u221e", + "infintie;": u"\u29dd", + "inodot;": u"\u0131", + "int;": u"\u222b", + "intcal;": u"\u22ba", + 
"integers;": u"\u2124", + "intercal;": u"\u22ba", + "intlarhk;": u"\u2a17", + "intprod;": u"\u2a3c", + "iocy;": u"\u0451", + "iogon;": u"\u012f", + "iopf;": u"\U0001d55a", + "iota;": u"\u03b9", + "iprod;": u"\u2a3c", + "iquest": u"\xbf", + "iquest;": u"\xbf", + "iscr;": u"\U0001d4be", + "isin;": u"\u2208", + "isinE;": u"\u22f9", + "isindot;": u"\u22f5", + "isins;": u"\u22f4", + "isinsv;": u"\u22f3", + "isinv;": u"\u2208", + "it;": u"\u2062", + "itilde;": u"\u0129", + "iukcy;": u"\u0456", + "iuml": u"\xef", + "iuml;": u"\xef", + "jcirc;": u"\u0135", + "jcy;": u"\u0439", + "jfr;": u"\U0001d527", + "jmath;": u"\u0237", + "jopf;": u"\U0001d55b", + "jscr;": u"\U0001d4bf", + "jsercy;": u"\u0458", + "jukcy;": u"\u0454", + "kappa;": u"\u03ba", + "kappav;": u"\u03f0", + "kcedil;": u"\u0137", + "kcy;": u"\u043a", + "kfr;": u"\U0001d528", + "kgreen;": u"\u0138", + "khcy;": u"\u0445", + "kjcy;": u"\u045c", + "kopf;": u"\U0001d55c", + "kscr;": u"\U0001d4c0", + "lAarr;": u"\u21da", + "lArr;": u"\u21d0", + "lAtail;": u"\u291b", + "lBarr;": u"\u290e", + "lE;": u"\u2266", + "lEg;": u"\u2a8b", + "lHar;": u"\u2962", + "lacute;": u"\u013a", + "laemptyv;": u"\u29b4", + "lagran;": u"\u2112", + "lambda;": u"\u03bb", + "lang;": u"\u27e8", + "langd;": u"\u2991", + "langle;": u"\u27e8", + "lap;": u"\u2a85", + "laquo": u"\xab", + "laquo;": u"\xab", + "larr;": u"\u2190", + "larrb;": u"\u21e4", + "larrbfs;": u"\u291f", + "larrfs;": u"\u291d", + "larrhk;": u"\u21a9", + "larrlp;": u"\u21ab", + "larrpl;": u"\u2939", + "larrsim;": u"\u2973", + "larrtl;": u"\u21a2", + "lat;": u"\u2aab", + "latail;": u"\u2919", + "late;": u"\u2aad", + "lates;": u"\u2aad\ufe00", + "lbarr;": u"\u290c", + "lbbrk;": u"\u2772", + "lbrace;": u"{", + "lbrack;": u"[", + "lbrke;": u"\u298b", + "lbrksld;": u"\u298f", + "lbrkslu;": u"\u298d", + "lcaron;": u"\u013e", + "lcedil;": u"\u013c", + "lceil;": u"\u2308", + "lcub;": u"{", + "lcy;": u"\u043b", + "ldca;": u"\u2936", + "ldquo;": u"\u201c", + "ldquor;": u"\u201e", + 
"ldrdhar;": u"\u2967", + "ldrushar;": u"\u294b", + "ldsh;": u"\u21b2", + "le;": u"\u2264", + "leftarrow;": u"\u2190", + "leftarrowtail;": u"\u21a2", + "leftharpoondown;": u"\u21bd", + "leftharpoonup;": u"\u21bc", + "leftleftarrows;": u"\u21c7", + "leftrightarrow;": u"\u2194", + "leftrightarrows;": u"\u21c6", + "leftrightharpoons;": u"\u21cb", + "leftrightsquigarrow;": u"\u21ad", + "leftthreetimes;": u"\u22cb", + "leg;": u"\u22da", + "leq;": u"\u2264", + "leqq;": u"\u2266", + "leqslant;": u"\u2a7d", + "les;": u"\u2a7d", + "lescc;": u"\u2aa8", + "lesdot;": u"\u2a7f", + "lesdoto;": u"\u2a81", + "lesdotor;": u"\u2a83", + "lesg;": u"\u22da\ufe00", + "lesges;": u"\u2a93", + "lessapprox;": u"\u2a85", + "lessdot;": u"\u22d6", + "lesseqgtr;": u"\u22da", + "lesseqqgtr;": u"\u2a8b", + "lessgtr;": u"\u2276", + "lesssim;": u"\u2272", + "lfisht;": u"\u297c", + "lfloor;": u"\u230a", + "lfr;": u"\U0001d529", + "lg;": u"\u2276", + "lgE;": u"\u2a91", + "lhard;": u"\u21bd", + "lharu;": u"\u21bc", + "lharul;": u"\u296a", + "lhblk;": u"\u2584", + "ljcy;": u"\u0459", + "ll;": u"\u226a", + "llarr;": u"\u21c7", + "llcorner;": u"\u231e", + "llhard;": u"\u296b", + "lltri;": u"\u25fa", + "lmidot;": u"\u0140", + "lmoust;": u"\u23b0", + "lmoustache;": u"\u23b0", + "lnE;": u"\u2268", + "lnap;": u"\u2a89", + "lnapprox;": u"\u2a89", + "lne;": u"\u2a87", + "lneq;": u"\u2a87", + "lneqq;": u"\u2268", + "lnsim;": u"\u22e6", + "loang;": u"\u27ec", + "loarr;": u"\u21fd", + "lobrk;": u"\u27e6", + "longleftarrow;": u"\u27f5", + "longleftrightarrow;": u"\u27f7", + "longmapsto;": u"\u27fc", + "longrightarrow;": u"\u27f6", + "looparrowleft;": u"\u21ab", + "looparrowright;": u"\u21ac", + "lopar;": u"\u2985", + "lopf;": u"\U0001d55d", + "loplus;": u"\u2a2d", + "lotimes;": u"\u2a34", + "lowast;": u"\u2217", + "lowbar;": u"_", + "loz;": u"\u25ca", + "lozenge;": u"\u25ca", + "lozf;": u"\u29eb", + "lpar;": u"(", + "lparlt;": u"\u2993", + "lrarr;": u"\u21c6", + "lrcorner;": u"\u231f", + "lrhar;": u"\u21cb", + 
"lrhard;": u"\u296d", + "lrm;": u"\u200e", + "lrtri;": u"\u22bf", + "lsaquo;": u"\u2039", + "lscr;": u"\U0001d4c1", + "lsh;": u"\u21b0", + "lsim;": u"\u2272", + "lsime;": u"\u2a8d", + "lsimg;": u"\u2a8f", + "lsqb;": u"[", + "lsquo;": u"\u2018", + "lsquor;": u"\u201a", + "lstrok;": u"\u0142", + "lt": u"<", + "lt;": u"<", + "ltcc;": u"\u2aa6", + "ltcir;": u"\u2a79", + "ltdot;": u"\u22d6", + "lthree;": u"\u22cb", + "ltimes;": u"\u22c9", + "ltlarr;": u"\u2976", + "ltquest;": u"\u2a7b", + "ltrPar;": u"\u2996", + "ltri;": u"\u25c3", + "ltrie;": u"\u22b4", + "ltrif;": u"\u25c2", + "lurdshar;": u"\u294a", + "luruhar;": u"\u2966", + "lvertneqq;": u"\u2268\ufe00", + "lvnE;": u"\u2268\ufe00", + "mDDot;": u"\u223a", + "macr": u"\xaf", + "macr;": u"\xaf", + "male;": u"\u2642", + "malt;": u"\u2720", + "maltese;": u"\u2720", + "map;": u"\u21a6", + "mapsto;": u"\u21a6", + "mapstodown;": u"\u21a7", + "mapstoleft;": u"\u21a4", + "mapstoup;": u"\u21a5", + "marker;": u"\u25ae", + "mcomma;": u"\u2a29", + "mcy;": u"\u043c", + "mdash;": u"\u2014", + "measuredangle;": u"\u2221", + "mfr;": u"\U0001d52a", + "mho;": u"\u2127", + "micro": u"\xb5", + "micro;": u"\xb5", + "mid;": u"\u2223", + "midast;": u"*", + "midcir;": u"\u2af0", + "middot": u"\xb7", + "middot;": u"\xb7", + "minus;": u"\u2212", + "minusb;": u"\u229f", + "minusd;": u"\u2238", + "minusdu;": u"\u2a2a", + "mlcp;": u"\u2adb", + "mldr;": u"\u2026", + "mnplus;": u"\u2213", + "models;": u"\u22a7", + "mopf;": u"\U0001d55e", + "mp;": u"\u2213", + "mscr;": u"\U0001d4c2", + "mstpos;": u"\u223e", + "mu;": u"\u03bc", + "multimap;": u"\u22b8", + "mumap;": u"\u22b8", + "nGg;": u"\u22d9\u0338", + "nGt;": u"\u226b\u20d2", + "nGtv;": u"\u226b\u0338", + "nLeftarrow;": u"\u21cd", + "nLeftrightarrow;": u"\u21ce", + "nLl;": u"\u22d8\u0338", + "nLt;": u"\u226a\u20d2", + "nLtv;": u"\u226a\u0338", + "nRightarrow;": u"\u21cf", + "nVDash;": u"\u22af", + "nVdash;": u"\u22ae", + "nabla;": u"\u2207", + "nacute;": u"\u0144", + "nang;": u"\u2220\u20d2", + 
"nap;": u"\u2249", + "napE;": u"\u2a70\u0338", + "napid;": u"\u224b\u0338", + "napos;": u"\u0149", + "napprox;": u"\u2249", + "natur;": u"\u266e", + "natural;": u"\u266e", + "naturals;": u"\u2115", + "nbsp": u"\xa0", + "nbsp;": u"\xa0", + "nbump;": u"\u224e\u0338", + "nbumpe;": u"\u224f\u0338", + "ncap;": u"\u2a43", + "ncaron;": u"\u0148", + "ncedil;": u"\u0146", + "ncong;": u"\u2247", + "ncongdot;": u"\u2a6d\u0338", + "ncup;": u"\u2a42", + "ncy;": u"\u043d", + "ndash;": u"\u2013", + "ne;": u"\u2260", + "neArr;": u"\u21d7", + "nearhk;": u"\u2924", + "nearr;": u"\u2197", + "nearrow;": u"\u2197", + "nedot;": u"\u2250\u0338", + "nequiv;": u"\u2262", + "nesear;": u"\u2928", + "nesim;": u"\u2242\u0338", + "nexist;": u"\u2204", + "nexists;": u"\u2204", + "nfr;": u"\U0001d52b", + "ngE;": u"\u2267\u0338", + "nge;": u"\u2271", + "ngeq;": u"\u2271", + "ngeqq;": u"\u2267\u0338", + "ngeqslant;": u"\u2a7e\u0338", + "nges;": u"\u2a7e\u0338", + "ngsim;": u"\u2275", + "ngt;": u"\u226f", + "ngtr;": u"\u226f", + "nhArr;": u"\u21ce", + "nharr;": u"\u21ae", + "nhpar;": u"\u2af2", + "ni;": u"\u220b", + "nis;": u"\u22fc", + "nisd;": u"\u22fa", + "niv;": u"\u220b", + "njcy;": u"\u045a", + "nlArr;": u"\u21cd", + "nlE;": u"\u2266\u0338", + "nlarr;": u"\u219a", + "nldr;": u"\u2025", + "nle;": u"\u2270", + "nleftarrow;": u"\u219a", + "nleftrightarrow;": u"\u21ae", + "nleq;": u"\u2270", + "nleqq;": u"\u2266\u0338", + "nleqslant;": u"\u2a7d\u0338", + "nles;": u"\u2a7d\u0338", + "nless;": u"\u226e", + "nlsim;": u"\u2274", + "nlt;": u"\u226e", + "nltri;": u"\u22ea", + "nltrie;": u"\u22ec", + "nmid;": u"\u2224", + "nopf;": u"\U0001d55f", + "not": u"\xac", + "not;": u"\xac", + "notin;": u"\u2209", + "notinE;": u"\u22f9\u0338", + "notindot;": u"\u22f5\u0338", + "notinva;": u"\u2209", + "notinvb;": u"\u22f7", + "notinvc;": u"\u22f6", + "notni;": u"\u220c", + "notniva;": u"\u220c", + "notnivb;": u"\u22fe", + "notnivc;": u"\u22fd", + "npar;": u"\u2226", + "nparallel;": u"\u2226", + "nparsl;": 
u"\u2afd\u20e5", + "npart;": u"\u2202\u0338", + "npolint;": u"\u2a14", + "npr;": u"\u2280", + "nprcue;": u"\u22e0", + "npre;": u"\u2aaf\u0338", + "nprec;": u"\u2280", + "npreceq;": u"\u2aaf\u0338", + "nrArr;": u"\u21cf", + "nrarr;": u"\u219b", + "nrarrc;": u"\u2933\u0338", + "nrarrw;": u"\u219d\u0338", + "nrightarrow;": u"\u219b", + "nrtri;": u"\u22eb", + "nrtrie;": u"\u22ed", + "nsc;": u"\u2281", + "nsccue;": u"\u22e1", + "nsce;": u"\u2ab0\u0338", + "nscr;": u"\U0001d4c3", + "nshortmid;": u"\u2224", + "nshortparallel;": u"\u2226", + "nsim;": u"\u2241", + "nsime;": u"\u2244", + "nsimeq;": u"\u2244", + "nsmid;": u"\u2224", + "nspar;": u"\u2226", + "nsqsube;": u"\u22e2", + "nsqsupe;": u"\u22e3", + "nsub;": u"\u2284", + "nsubE;": u"\u2ac5\u0338", + "nsube;": u"\u2288", + "nsubset;": u"\u2282\u20d2", + "nsubseteq;": u"\u2288", + "nsubseteqq;": u"\u2ac5\u0338", + "nsucc;": u"\u2281", + "nsucceq;": u"\u2ab0\u0338", + "nsup;": u"\u2285", + "nsupE;": u"\u2ac6\u0338", + "nsupe;": u"\u2289", + "nsupset;": u"\u2283\u20d2", + "nsupseteq;": u"\u2289", + "nsupseteqq;": u"\u2ac6\u0338", + "ntgl;": u"\u2279", + "ntilde": u"\xf1", + "ntilde;": u"\xf1", + "ntlg;": u"\u2278", + "ntriangleleft;": u"\u22ea", + "ntrianglelefteq;": u"\u22ec", + "ntriangleright;": u"\u22eb", + "ntrianglerighteq;": u"\u22ed", + "nu;": u"\u03bd", + "num;": u"#", + "numero;": u"\u2116", + "numsp;": u"\u2007", + "nvDash;": u"\u22ad", + "nvHarr;": u"\u2904", + "nvap;": u"\u224d\u20d2", + "nvdash;": u"\u22ac", + "nvge;": u"\u2265\u20d2", + "nvgt;": u">\u20d2", + "nvinfin;": u"\u29de", + "nvlArr;": u"\u2902", + "nvle;": u"\u2264\u20d2", + "nvlt;": u"<\u20d2", + "nvltrie;": u"\u22b4\u20d2", + "nvrArr;": u"\u2903", + "nvrtrie;": u"\u22b5\u20d2", + "nvsim;": u"\u223c\u20d2", + "nwArr;": u"\u21d6", + "nwarhk;": u"\u2923", + "nwarr;": u"\u2196", + "nwarrow;": u"\u2196", + "nwnear;": u"\u2927", + "oS;": u"\u24c8", + "oacute": u"\xf3", + "oacute;": u"\xf3", + "oast;": u"\u229b", + "ocir;": u"\u229a", + "ocirc": 
u"\xf4", + "ocirc;": u"\xf4", + "ocy;": u"\u043e", + "odash;": u"\u229d", + "odblac;": u"\u0151", + "odiv;": u"\u2a38", + "odot;": u"\u2299", + "odsold;": u"\u29bc", + "oelig;": u"\u0153", + "ofcir;": u"\u29bf", + "ofr;": u"\U0001d52c", + "ogon;": u"\u02db", + "ograve": u"\xf2", + "ograve;": u"\xf2", + "ogt;": u"\u29c1", + "ohbar;": u"\u29b5", + "ohm;": u"\u03a9", + "oint;": u"\u222e", + "olarr;": u"\u21ba", + "olcir;": u"\u29be", + "olcross;": u"\u29bb", + "oline;": u"\u203e", + "olt;": u"\u29c0", + "omacr;": u"\u014d", + "omega;": u"\u03c9", + "omicron;": u"\u03bf", + "omid;": u"\u29b6", + "ominus;": u"\u2296", + "oopf;": u"\U0001d560", + "opar;": u"\u29b7", + "operp;": u"\u29b9", + "oplus;": u"\u2295", + "or;": u"\u2228", + "orarr;": u"\u21bb", + "ord;": u"\u2a5d", + "order;": u"\u2134", + "orderof;": u"\u2134", + "ordf": u"\xaa", + "ordf;": u"\xaa", + "ordm": u"\xba", + "ordm;": u"\xba", + "origof;": u"\u22b6", + "oror;": u"\u2a56", + "orslope;": u"\u2a57", + "orv;": u"\u2a5b", + "oscr;": u"\u2134", + "oslash": u"\xf8", + "oslash;": u"\xf8", + "osol;": u"\u2298", + "otilde": u"\xf5", + "otilde;": u"\xf5", + "otimes;": u"\u2297", + "otimesas;": u"\u2a36", + "ouml": u"\xf6", + "ouml;": u"\xf6", + "ovbar;": u"\u233d", + "par;": u"\u2225", + "para": u"\xb6", + "para;": u"\xb6", + "parallel;": u"\u2225", + "parsim;": u"\u2af3", + "parsl;": u"\u2afd", + "part;": u"\u2202", + "pcy;": u"\u043f", + "percnt;": u"%", + "period;": u".", + "permil;": u"\u2030", + "perp;": u"\u22a5", + "pertenk;": u"\u2031", + "pfr;": u"\U0001d52d", + "phi;": u"\u03c6", + "phiv;": u"\u03d5", + "phmmat;": u"\u2133", + "phone;": u"\u260e", + "pi;": u"\u03c0", + "pitchfork;": u"\u22d4", + "piv;": u"\u03d6", + "planck;": u"\u210f", + "planckh;": u"\u210e", + "plankv;": u"\u210f", + "plus;": u"+", + "plusacir;": u"\u2a23", + "plusb;": u"\u229e", + "pluscir;": u"\u2a22", + "plusdo;": u"\u2214", + "plusdu;": u"\u2a25", + "pluse;": u"\u2a72", + "plusmn": u"\xb1", + "plusmn;": u"\xb1", + "plussim;": 
u"\u2a26", + "plustwo;": u"\u2a27", + "pm;": u"\xb1", + "pointint;": u"\u2a15", + "popf;": u"\U0001d561", + "pound": u"\xa3", + "pound;": u"\xa3", + "pr;": u"\u227a", + "prE;": u"\u2ab3", + "prap;": u"\u2ab7", + "prcue;": u"\u227c", + "pre;": u"\u2aaf", + "prec;": u"\u227a", + "precapprox;": u"\u2ab7", + "preccurlyeq;": u"\u227c", + "preceq;": u"\u2aaf", + "precnapprox;": u"\u2ab9", + "precneqq;": u"\u2ab5", + "precnsim;": u"\u22e8", + "precsim;": u"\u227e", + "prime;": u"\u2032", + "primes;": u"\u2119", + "prnE;": u"\u2ab5", + "prnap;": u"\u2ab9", + "prnsim;": u"\u22e8", + "prod;": u"\u220f", + "profalar;": u"\u232e", + "profline;": u"\u2312", + "profsurf;": u"\u2313", + "prop;": u"\u221d", + "propto;": u"\u221d", + "prsim;": u"\u227e", + "prurel;": u"\u22b0", + "pscr;": u"\U0001d4c5", + "psi;": u"\u03c8", + "puncsp;": u"\u2008", + "qfr;": u"\U0001d52e", + "qint;": u"\u2a0c", + "qopf;": u"\U0001d562", + "qprime;": u"\u2057", + "qscr;": u"\U0001d4c6", + "quaternions;": u"\u210d", + "quatint;": u"\u2a16", + "quest;": u"?", + "questeq;": u"\u225f", + "quot": u"\"", + "quot;": u"\"", + "rAarr;": u"\u21db", + "rArr;": u"\u21d2", + "rAtail;": u"\u291c", + "rBarr;": u"\u290f", + "rHar;": u"\u2964", + "race;": u"\u223d\u0331", + "racute;": u"\u0155", + "radic;": u"\u221a", + "raemptyv;": u"\u29b3", + "rang;": u"\u27e9", + "rangd;": u"\u2992", + "range;": u"\u29a5", + "rangle;": u"\u27e9", + "raquo": u"\xbb", + "raquo;": u"\xbb", + "rarr;": u"\u2192", + "rarrap;": u"\u2975", + "rarrb;": u"\u21e5", + "rarrbfs;": u"\u2920", + "rarrc;": u"\u2933", + "rarrfs;": u"\u291e", + "rarrhk;": u"\u21aa", + "rarrlp;": u"\u21ac", + "rarrpl;": u"\u2945", + "rarrsim;": u"\u2974", + "rarrtl;": u"\u21a3", + "rarrw;": u"\u219d", + "ratail;": u"\u291a", + "ratio;": u"\u2236", + "rationals;": u"\u211a", + "rbarr;": u"\u290d", + "rbbrk;": u"\u2773", + "rbrace;": u"}", + "rbrack;": u"]", + "rbrke;": u"\u298c", + "rbrksld;": u"\u298e", + "rbrkslu;": u"\u2990", + "rcaron;": u"\u0159", + "rcedil;": 
u"\u0157", + "rceil;": u"\u2309", + "rcub;": u"}", + "rcy;": u"\u0440", + "rdca;": u"\u2937", + "rdldhar;": u"\u2969", + "rdquo;": u"\u201d", + "rdquor;": u"\u201d", + "rdsh;": u"\u21b3", + "real;": u"\u211c", + "realine;": u"\u211b", + "realpart;": u"\u211c", + "reals;": u"\u211d", + "rect;": u"\u25ad", + "reg": u"\xae", + "reg;": u"\xae", + "rfisht;": u"\u297d", + "rfloor;": u"\u230b", + "rfr;": u"\U0001d52f", + "rhard;": u"\u21c1", + "rharu;": u"\u21c0", + "rharul;": u"\u296c", + "rho;": u"\u03c1", + "rhov;": u"\u03f1", + "rightarrow;": u"\u2192", + "rightarrowtail;": u"\u21a3", + "rightharpoondown;": u"\u21c1", + "rightharpoonup;": u"\u21c0", + "rightleftarrows;": u"\u21c4", + "rightleftharpoons;": u"\u21cc", + "rightrightarrows;": u"\u21c9", + "rightsquigarrow;": u"\u219d", + "rightthreetimes;": u"\u22cc", + "ring;": u"\u02da", + "risingdotseq;": u"\u2253", + "rlarr;": u"\u21c4", + "rlhar;": u"\u21cc", + "rlm;": u"\u200f", + "rmoust;": u"\u23b1", + "rmoustache;": u"\u23b1", + "rnmid;": u"\u2aee", + "roang;": u"\u27ed", + "roarr;": u"\u21fe", + "robrk;": u"\u27e7", + "ropar;": u"\u2986", + "ropf;": u"\U0001d563", + "roplus;": u"\u2a2e", + "rotimes;": u"\u2a35", + "rpar;": u")", + "rpargt;": u"\u2994", + "rppolint;": u"\u2a12", + "rrarr;": u"\u21c9", + "rsaquo;": u"\u203a", + "rscr;": u"\U0001d4c7", + "rsh;": u"\u21b1", + "rsqb;": u"]", + "rsquo;": u"\u2019", + "rsquor;": u"\u2019", + "rthree;": u"\u22cc", + "rtimes;": u"\u22ca", + "rtri;": u"\u25b9", + "rtrie;": u"\u22b5", + "rtrif;": u"\u25b8", + "rtriltri;": u"\u29ce", + "ruluhar;": u"\u2968", + "rx;": u"\u211e", + "sacute;": u"\u015b", + "sbquo;": u"\u201a", + "sc;": u"\u227b", + "scE;": u"\u2ab4", + "scap;": u"\u2ab8", + "scaron;": u"\u0161", + "sccue;": u"\u227d", + "sce;": u"\u2ab0", + "scedil;": u"\u015f", + "scirc;": u"\u015d", + "scnE;": u"\u2ab6", + "scnap;": u"\u2aba", + "scnsim;": u"\u22e9", + "scpolint;": u"\u2a13", + "scsim;": u"\u227f", + "scy;": u"\u0441", + "sdot;": u"\u22c5", + "sdotb;": 
u"\u22a1", + "sdote;": u"\u2a66", + "seArr;": u"\u21d8", + "searhk;": u"\u2925", + "searr;": u"\u2198", + "searrow;": u"\u2198", + "sect": u"\xa7", + "sect;": u"\xa7", + "semi;": u";", + "seswar;": u"\u2929", + "setminus;": u"\u2216", + "setmn;": u"\u2216", + "sext;": u"\u2736", + "sfr;": u"\U0001d530", + "sfrown;": u"\u2322", + "sharp;": u"\u266f", + "shchcy;": u"\u0449", + "shcy;": u"\u0448", + "shortmid;": u"\u2223", + "shortparallel;": u"\u2225", + "shy": u"\xad", + "shy;": u"\xad", + "sigma;": u"\u03c3", + "sigmaf;": u"\u03c2", + "sigmav;": u"\u03c2", + "sim;": u"\u223c", + "simdot;": u"\u2a6a", + "sime;": u"\u2243", + "simeq;": u"\u2243", + "simg;": u"\u2a9e", + "simgE;": u"\u2aa0", + "siml;": u"\u2a9d", + "simlE;": u"\u2a9f", + "simne;": u"\u2246", + "simplus;": u"\u2a24", + "simrarr;": u"\u2972", + "slarr;": u"\u2190", + "smallsetminus;": u"\u2216", + "smashp;": u"\u2a33", + "smeparsl;": u"\u29e4", + "smid;": u"\u2223", + "smile;": u"\u2323", + "smt;": u"\u2aaa", + "smte;": u"\u2aac", + "smtes;": u"\u2aac\ufe00", + "softcy;": u"\u044c", + "sol;": u"/", + "solb;": u"\u29c4", + "solbar;": u"\u233f", + "sopf;": u"\U0001d564", + "spades;": u"\u2660", + "spadesuit;": u"\u2660", + "spar;": u"\u2225", + "sqcap;": u"\u2293", + "sqcaps;": u"\u2293\ufe00", + "sqcup;": u"\u2294", + "sqcups;": u"\u2294\ufe00", + "sqsub;": u"\u228f", + "sqsube;": u"\u2291", + "sqsubset;": u"\u228f", + "sqsubseteq;": u"\u2291", + "sqsup;": u"\u2290", + "sqsupe;": u"\u2292", + "sqsupset;": u"\u2290", + "sqsupseteq;": u"\u2292", + "squ;": u"\u25a1", + "square;": u"\u25a1", + "squarf;": u"\u25aa", + "squf;": u"\u25aa", + "srarr;": u"\u2192", + "sscr;": u"\U0001d4c8", + "ssetmn;": u"\u2216", + "ssmile;": u"\u2323", + "sstarf;": u"\u22c6", + "star;": u"\u2606", + "starf;": u"\u2605", + "straightepsilon;": u"\u03f5", + "straightphi;": u"\u03d5", + "strns;": u"\xaf", + "sub;": u"\u2282", + "subE;": u"\u2ac5", + "subdot;": u"\u2abd", + "sube;": u"\u2286", + "subedot;": u"\u2ac3", + "submult;": 
u"\u2ac1", + "subnE;": u"\u2acb", + "subne;": u"\u228a", + "subplus;": u"\u2abf", + "subrarr;": u"\u2979", + "subset;": u"\u2282", + "subseteq;": u"\u2286", + "subseteqq;": u"\u2ac5", + "subsetneq;": u"\u228a", + "subsetneqq;": u"\u2acb", + "subsim;": u"\u2ac7", + "subsub;": u"\u2ad5", + "subsup;": u"\u2ad3", + "succ;": u"\u227b", + "succapprox;": u"\u2ab8", + "succcurlyeq;": u"\u227d", + "succeq;": u"\u2ab0", + "succnapprox;": u"\u2aba", + "succneqq;": u"\u2ab6", + "succnsim;": u"\u22e9", + "succsim;": u"\u227f", + "sum;": u"\u2211", + "sung;": u"\u266a", + "sup1": u"\xb9", + "sup1;": u"\xb9", + "sup2": u"\xb2", + "sup2;": u"\xb2", + "sup3": u"\xb3", + "sup3;": u"\xb3", + "sup;": u"\u2283", + "supE;": u"\u2ac6", + "supdot;": u"\u2abe", + "supdsub;": u"\u2ad8", + "supe;": u"\u2287", + "supedot;": u"\u2ac4", + "suphsol;": u"\u27c9", + "suphsub;": u"\u2ad7", + "suplarr;": u"\u297b", + "supmult;": u"\u2ac2", + "supnE;": u"\u2acc", + "supne;": u"\u228b", + "supplus;": u"\u2ac0", + "supset;": u"\u2283", + "supseteq;": u"\u2287", + "supseteqq;": u"\u2ac6", + "supsetneq;": u"\u228b", + "supsetneqq;": u"\u2acc", + "supsim;": u"\u2ac8", + "supsub;": u"\u2ad4", + "supsup;": u"\u2ad6", + "swArr;": u"\u21d9", + "swarhk;": u"\u2926", + "swarr;": u"\u2199", + "swarrow;": u"\u2199", + "swnwar;": u"\u292a", + "szlig": u"\xdf", + "szlig;": u"\xdf", + "target;": u"\u2316", + "tau;": u"\u03c4", + "tbrk;": u"\u23b4", + "tcaron;": u"\u0165", + "tcedil;": u"\u0163", + "tcy;": u"\u0442", + "tdot;": u"\u20db", + "telrec;": u"\u2315", + "tfr;": u"\U0001d531", + "there4;": u"\u2234", + "therefore;": u"\u2234", + "theta;": u"\u03b8", + "thetasym;": u"\u03d1", + "thetav;": u"\u03d1", + "thickapprox;": u"\u2248", + "thicksim;": u"\u223c", + "thinsp;": u"\u2009", + "thkap;": u"\u2248", + "thksim;": u"\u223c", + "thorn": u"\xfe", + "thorn;": u"\xfe", + "tilde;": u"\u02dc", + "times": u"\xd7", + "times;": u"\xd7", + "timesb;": u"\u22a0", + "timesbar;": u"\u2a31", + "timesd;": u"\u2a30", + 
"tint;": u"\u222d", + "toea;": u"\u2928", + "top;": u"\u22a4", + "topbot;": u"\u2336", + "topcir;": u"\u2af1", + "topf;": u"\U0001d565", + "topfork;": u"\u2ada", + "tosa;": u"\u2929", + "tprime;": u"\u2034", + "trade;": u"\u2122", + "triangle;": u"\u25b5", + "triangledown;": u"\u25bf", + "triangleleft;": u"\u25c3", + "trianglelefteq;": u"\u22b4", + "triangleq;": u"\u225c", + "triangleright;": u"\u25b9", + "trianglerighteq;": u"\u22b5", + "tridot;": u"\u25ec", + "trie;": u"\u225c", + "triminus;": u"\u2a3a", + "triplus;": u"\u2a39", + "trisb;": u"\u29cd", + "tritime;": u"\u2a3b", + "trpezium;": u"\u23e2", + "tscr;": u"\U0001d4c9", + "tscy;": u"\u0446", + "tshcy;": u"\u045b", + "tstrok;": u"\u0167", + "twixt;": u"\u226c", + "twoheadleftarrow;": u"\u219e", + "twoheadrightarrow;": u"\u21a0", + "uArr;": u"\u21d1", + "uHar;": u"\u2963", + "uacute": u"\xfa", + "uacute;": u"\xfa", + "uarr;": u"\u2191", + "ubrcy;": u"\u045e", + "ubreve;": u"\u016d", + "ucirc": u"\xfb", + "ucirc;": u"\xfb", + "ucy;": u"\u0443", + "udarr;": u"\u21c5", + "udblac;": u"\u0171", + "udhar;": u"\u296e", + "ufisht;": u"\u297e", + "ufr;": u"\U0001d532", + "ugrave": u"\xf9", + "ugrave;": u"\xf9", + "uharl;": u"\u21bf", + "uharr;": u"\u21be", + "uhblk;": u"\u2580", + "ulcorn;": u"\u231c", + "ulcorner;": u"\u231c", + "ulcrop;": u"\u230f", + "ultri;": u"\u25f8", + "umacr;": u"\u016b", + "uml": u"\xa8", + "uml;": u"\xa8", + "uogon;": u"\u0173", + "uopf;": u"\U0001d566", + "uparrow;": u"\u2191", + "updownarrow;": u"\u2195", + "upharpoonleft;": u"\u21bf", + "upharpoonright;": u"\u21be", + "uplus;": u"\u228e", + "upsi;": u"\u03c5", + "upsih;": u"\u03d2", + "upsilon;": u"\u03c5", + "upuparrows;": u"\u21c8", + "urcorn;": u"\u231d", + "urcorner;": u"\u231d", + "urcrop;": u"\u230e", + "uring;": u"\u016f", + "urtri;": u"\u25f9", + "uscr;": u"\U0001d4ca", + "utdot;": u"\u22f0", + "utilde;": u"\u0169", + "utri;": u"\u25b5", + "utrif;": u"\u25b4", + "uuarr;": u"\u21c8", + "uuml": u"\xfc", + "uuml;": u"\xfc", + 
"uwangle;": u"\u29a7", + "vArr;": u"\u21d5", + "vBar;": u"\u2ae8", + "vBarv;": u"\u2ae9", + "vDash;": u"\u22a8", + "vangrt;": u"\u299c", + "varepsilon;": u"\u03f5", + "varkappa;": u"\u03f0", + "varnothing;": u"\u2205", + "varphi;": u"\u03d5", + "varpi;": u"\u03d6", + "varpropto;": u"\u221d", + "varr;": u"\u2195", + "varrho;": u"\u03f1", + "varsigma;": u"\u03c2", + "varsubsetneq;": u"\u228a\ufe00", + "varsubsetneqq;": u"\u2acb\ufe00", + "varsupsetneq;": u"\u228b\ufe00", + "varsupsetneqq;": u"\u2acc\ufe00", + "vartheta;": u"\u03d1", + "vartriangleleft;": u"\u22b2", + "vartriangleright;": u"\u22b3", + "vcy;": u"\u0432", + "vdash;": u"\u22a2", + "vee;": u"\u2228", + "veebar;": u"\u22bb", + "veeeq;": u"\u225a", + "vellip;": u"\u22ee", + "verbar;": u"|", + "vert;": u"|", + "vfr;": u"\U0001d533", + "vltri;": u"\u22b2", + "vnsub;": u"\u2282\u20d2", + "vnsup;": u"\u2283\u20d2", + "vopf;": u"\U0001d567", + "vprop;": u"\u221d", + "vrtri;": u"\u22b3", + "vscr;": u"\U0001d4cb", + "vsubnE;": u"\u2acb\ufe00", + "vsubne;": u"\u228a\ufe00", + "vsupnE;": u"\u2acc\ufe00", + "vsupne;": u"\u228b\ufe00", + "vzigzag;": u"\u299a", + "wcirc;": u"\u0175", + "wedbar;": u"\u2a5f", + "wedge;": u"\u2227", + "wedgeq;": u"\u2259", + "weierp;": u"\u2118", + "wfr;": u"\U0001d534", + "wopf;": u"\U0001d568", + "wp;": u"\u2118", + "wr;": u"\u2240", + "wreath;": u"\u2240", + "wscr;": u"\U0001d4cc", + "xcap;": u"\u22c2", + "xcirc;": u"\u25ef", + "xcup;": u"\u22c3", + "xdtri;": u"\u25bd", + "xfr;": u"\U0001d535", + "xhArr;": u"\u27fa", + "xharr;": u"\u27f7", + "xi;": u"\u03be", + "xlArr;": u"\u27f8", + "xlarr;": u"\u27f5", + "xmap;": u"\u27fc", + "xnis;": u"\u22fb", + "xodot;": u"\u2a00", + "xopf;": u"\U0001d569", + "xoplus;": u"\u2a01", + "xotime;": u"\u2a02", + "xrArr;": u"\u27f9", + "xrarr;": u"\u27f6", + "xscr;": u"\U0001d4cd", + "xsqcup;": u"\u2a06", + "xuplus;": u"\u2a04", + "xutri;": u"\u25b3", + "xvee;": u"\u22c1", + "xwedge;": u"\u22c0", + "yacute": u"\xfd", + "yacute;": u"\xfd", + "yacy;": 
u"\u044f", + "ycirc;": u"\u0177", + "ycy;": u"\u044b", + "yen": u"\xa5", + "yen;": u"\xa5", + "yfr;": u"\U0001d536", + "yicy;": u"\u0457", + "yopf;": u"\U0001d56a", + "yscr;": u"\U0001d4ce", + "yucy;": u"\u044e", + "yuml": u"\xff", + "yuml;": u"\xff", + "zacute;": u"\u017a", + "zcaron;": u"\u017e", + "zcy;": u"\u0437", + "zdot;": u"\u017c", + "zeetrf;": u"\u2128", + "zeta;": u"\u03b6", + "zfr;": u"\U0001d537", + "zhcy;": u"\u0436", + "zigrarr;": u"\u21dd", + "zopf;": u"\U0001d56b", + "zscr;": u"\U0001d4cf", + "zwj;": u"\u200d", + "zwnj;": u"\u200c", +} + +replacementCharacters = { + 0x0:u"\uFFFD", + 0x0d:u"\u000D", + 0x80:u"\u20AC", + 0x81:u"\u0081", + 0x81:u"\u0081", + 0x82:u"\u201A", + 0x83:u"\u0192", + 0x84:u"\u201E", + 0x85:u"\u2026", + 0x86:u"\u2020", + 0x87:u"\u2021", + 0x88:u"\u02C6", + 0x89:u"\u2030", + 0x8A:u"\u0160", + 0x8B:u"\u2039", + 0x8C:u"\u0152", + 0x8D:u"\u008D", + 0x8E:u"\u017D", + 0x8F:u"\u008F", + 0x90:u"\u0090", + 0x91:u"\u2018", + 0x92:u"\u2019", + 0x93:u"\u201C", + 0x94:u"\u201D", + 0x95:u"\u2022", + 0x96:u"\u2013", + 0x97:u"\u2014", + 0x98:u"\u02DC", + 0x99:u"\u2122", + 0x9A:u"\u0161", + 0x9B:u"\u203A", + 0x9C:u"\u0153", + 0x9D:u"\u009D", + 0x9E:u"\u017E", + 0x9F:u"\u0178", +} + +encodings = { + '437': 'cp437', + '850': 'cp850', + '852': 'cp852', + '855': 'cp855', + '857': 'cp857', + '860': 'cp860', + '861': 'cp861', + '862': 'cp862', + '863': 'cp863', + '865': 'cp865', + '866': 'cp866', + '869': 'cp869', + 'ansix341968': 'ascii', + 'ansix341986': 'ascii', + 'arabic': 'iso8859-6', + 'ascii': 'ascii', + 'asmo708': 'iso8859-6', + 'big5': 'big5', + 'big5hkscs': 'big5hkscs', + 'chinese': 'gbk', + 'cp037': 'cp037', + 'cp1026': 'cp1026', + 'cp154': 'ptcp154', + 'cp367': 'ascii', + 'cp424': 'cp424', + 'cp437': 'cp437', + 'cp500': 'cp500', + 'cp775': 'cp775', + 'cp819': 'windows-1252', + 'cp850': 'cp850', + 'cp852': 'cp852', + 'cp855': 'cp855', + 'cp857': 'cp857', + 'cp860': 'cp860', + 'cp861': 'cp861', + 'cp862': 'cp862', + 'cp863': 'cp863', + 
'cp864': 'cp864', + 'cp865': 'cp865', + 'cp866': 'cp866', + 'cp869': 'cp869', + 'cp936': 'gbk', + 'cpgr': 'cp869', + 'cpis': 'cp861', + 'csascii': 'ascii', + 'csbig5': 'big5', + 'cseuckr': 'cp949', + 'cseucpkdfmtjapanese': 'euc_jp', + 'csgb2312': 'gbk', + 'cshproman8': 'hp-roman8', + 'csibm037': 'cp037', + 'csibm1026': 'cp1026', + 'csibm424': 'cp424', + 'csibm500': 'cp500', + 'csibm855': 'cp855', + 'csibm857': 'cp857', + 'csibm860': 'cp860', + 'csibm861': 'cp861', + 'csibm863': 'cp863', + 'csibm864': 'cp864', + 'csibm865': 'cp865', + 'csibm866': 'cp866', + 'csibm869': 'cp869', + 'csiso2022jp': 'iso2022_jp', + 'csiso2022jp2': 'iso2022_jp_2', + 'csiso2022kr': 'iso2022_kr', + 'csiso58gb231280': 'gbk', + 'csisolatin1': 'windows-1252', + 'csisolatin2': 'iso8859-2', + 'csisolatin3': 'iso8859-3', + 'csisolatin4': 'iso8859-4', + 'csisolatin5': 'windows-1254', + 'csisolatin6': 'iso8859-10', + 'csisolatinarabic': 'iso8859-6', + 'csisolatincyrillic': 'iso8859-5', + 'csisolatingreek': 'iso8859-7', + 'csisolatinhebrew': 'iso8859-8', + 'cskoi8r': 'koi8-r', + 'csksc56011987': 'cp949', + 'cspc775baltic': 'cp775', + 'cspc850multilingual': 'cp850', + 'cspc862latinhebrew': 'cp862', + 'cspc8codepage437': 'cp437', + 'cspcp852': 'cp852', + 'csptcp154': 'ptcp154', + 'csshiftjis': 'shift_jis', + 'csunicode11utf7': 'utf-7', + 'cyrillic': 'iso8859-5', + 'cyrillicasian': 'ptcp154', + 'ebcdiccpbe': 'cp500', + 'ebcdiccpca': 'cp037', + 'ebcdiccpch': 'cp500', + 'ebcdiccphe': 'cp424', + 'ebcdiccpnl': 'cp037', + 'ebcdiccpus': 'cp037', + 'ebcdiccpwt': 'cp037', + 'ecma114': 'iso8859-6', + 'ecma118': 'iso8859-7', + 'elot928': 'iso8859-7', + 'eucjp': 'euc_jp', + 'euckr': 'cp949', + 'extendedunixcodepackedformatforjapanese': 'euc_jp', + 'gb18030': 'gb18030', + 'gb2312': 'gbk', + 'gb231280': 'gbk', + 'gbk': 'gbk', + 'greek': 'iso8859-7', + 'greek8': 'iso8859-7', + 'hebrew': 'iso8859-8', + 'hproman8': 'hp-roman8', + 'hzgb2312': 'hz', + 'ibm037': 'cp037', + 'ibm1026': 'cp1026', + 'ibm367': 'ascii', + 
'ibm424': 'cp424', + 'ibm437': 'cp437', + 'ibm500': 'cp500', + 'ibm775': 'cp775', + 'ibm819': 'windows-1252', + 'ibm850': 'cp850', + 'ibm852': 'cp852', + 'ibm855': 'cp855', + 'ibm857': 'cp857', + 'ibm860': 'cp860', + 'ibm861': 'cp861', + 'ibm862': 'cp862', + 'ibm863': 'cp863', + 'ibm864': 'cp864', + 'ibm865': 'cp865', + 'ibm866': 'cp866', + 'ibm869': 'cp869', + 'iso2022jp': 'iso2022_jp', + 'iso2022jp2': 'iso2022_jp_2', + 'iso2022kr': 'iso2022_kr', + 'iso646irv1991': 'ascii', + 'iso646us': 'ascii', + 'iso88591': 'windows-1252', + 'iso885910': 'iso8859-10', + 'iso8859101992': 'iso8859-10', + 'iso885911987': 'windows-1252', + 'iso885913': 'iso8859-13', + 'iso885914': 'iso8859-14', + 'iso8859141998': 'iso8859-14', + 'iso885915': 'iso8859-15', + 'iso885916': 'iso8859-16', + 'iso8859162001': 'iso8859-16', + 'iso88592': 'iso8859-2', + 'iso885921987': 'iso8859-2', + 'iso88593': 'iso8859-3', + 'iso885931988': 'iso8859-3', + 'iso88594': 'iso8859-4', + 'iso885941988': 'iso8859-4', + 'iso88595': 'iso8859-5', + 'iso885951988': 'iso8859-5', + 'iso88596': 'iso8859-6', + 'iso885961987': 'iso8859-6', + 'iso88597': 'iso8859-7', + 'iso885971987': 'iso8859-7', + 'iso88598': 'iso8859-8', + 'iso885981988': 'iso8859-8', + 'iso88599': 'windows-1254', + 'iso885991989': 'windows-1254', + 'isoceltic': 'iso8859-14', + 'isoir100': 'windows-1252', + 'isoir101': 'iso8859-2', + 'isoir109': 'iso8859-3', + 'isoir110': 'iso8859-4', + 'isoir126': 'iso8859-7', + 'isoir127': 'iso8859-6', + 'isoir138': 'iso8859-8', + 'isoir144': 'iso8859-5', + 'isoir148': 'windows-1254', + 'isoir149': 'cp949', + 'isoir157': 'iso8859-10', + 'isoir199': 'iso8859-14', + 'isoir226': 'iso8859-16', + 'isoir58': 'gbk', + 'isoir6': 'ascii', + 'koi8r': 'koi8-r', + 'koi8u': 'koi8-u', + 'korean': 'cp949', + 'ksc5601': 'cp949', + 'ksc56011987': 'cp949', + 'ksc56011989': 'cp949', + 'l1': 'windows-1252', + 'l10': 'iso8859-16', + 'l2': 'iso8859-2', + 'l3': 'iso8859-3', + 'l4': 'iso8859-4', + 'l5': 'windows-1254', + 'l6': 'iso8859-10', 
+ 'l8': 'iso8859-14', + 'latin1': 'windows-1252', + 'latin10': 'iso8859-16', + 'latin2': 'iso8859-2', + 'latin3': 'iso8859-3', + 'latin4': 'iso8859-4', + 'latin5': 'windows-1254', + 'latin6': 'iso8859-10', + 'latin8': 'iso8859-14', + 'latin9': 'iso8859-15', + 'ms936': 'gbk', + 'mskanji': 'shift_jis', + 'pt154': 'ptcp154', + 'ptcp154': 'ptcp154', + 'r8': 'hp-roman8', + 'roman8': 'hp-roman8', + 'shiftjis': 'shift_jis', + 'tis620': 'cp874', + 'unicode11utf7': 'utf-7', + 'us': 'ascii', + 'usascii': 'ascii', + 'utf16': 'utf-16', + 'utf16be': 'utf-16-be', + 'utf16le': 'utf-16-le', + 'utf8': 'utf-8', + 'windows1250': 'cp1250', + 'windows1251': 'cp1251', + 'windows1252': 'cp1252', + 'windows1253': 'cp1253', + 'windows1254': 'cp1254', + 'windows1255': 'cp1255', + 'windows1256': 'cp1256', + 'windows1257': 'cp1257', + 'windows1258': 'cp1258', + 'windows936': 'gbk', + 'x-x-big5': 'big5'} + +tokenTypes = { + "Doctype":0, + "Characters":1, + "SpaceCharacters":2, + "StartTag":3, + "EndTag":4, + "EmptyTag":5, + "Comment":6, + "ParseError":7 +} + +tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"])) + + +prefixes = dict([(v,k) for k,v in namespaces.iteritems()]) +prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + +class DataLossWarning(UserWarning): + pass + +class ReparseException(Exception): + pass diff --git a/html5lib/filters/__init__.py b/html5lib/filters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/html5lib/filters/_base.py b/html5lib/filters/_base.py new file mode 100644 index 00000000..bca94ada --- /dev/null +++ b/html5lib/filters/_base.py @@ -0,0 +1,10 @@ + +class Filter(object): + def __init__(self, source): + self.source = source + + def __iter__(self): + return iter(self.source) + + def __getattr__(self, name): + return getattr(self.source, name) diff --git a/html5lib/filters/formfiller.py b/html5lib/filters/formfiller.py new file mode 100644 index 00000000..94001714 --- /dev/null +++ 
b/html5lib/filters/formfiller.py @@ -0,0 +1,127 @@ +# +# The goal is to finally have a form filler where you pass data for +# each form, using the algorithm for "Seeding a form with initial values" +# See http://www.whatwg.org/specs/web-forms/current-work/#seeding +# + +import _base + +from html5lib.constants import spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class SimpleFilter(_base.Filter): + def __init__(self, source, fieldStorage): + _base.Filter.__init__(self, source) + self.fieldStorage = fieldStorage + + def __iter__(self): + field_indices = {} + state = None + field_name = None + for token in _base.Filter.__iter__(self): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + name = token["name"].lower() + if name == "input": + field_name = None + field_type = None + input_value_index = -1 + input_checked_index = -1 + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == u"name": + field_name = v.strip(spaceCharacters) + elif n == u"type": + field_type = v.strip(spaceCharacters) + elif n == u"checked": + input_checked_index = i + elif n == u"value": + input_value_index = i + + value_list = self.fieldStorage.getlist(field_name) + field_index = field_indices.setdefault(field_name, 0) + if field_index < len(value_list): + value = value_list[field_index] + else: + value = "" + + if field_type in (u"checkbox", u"radio"): + if value_list: + if token["data"][input_value_index][1] == value: + if input_checked_index < 0: + token["data"].append((u"checked", u"")) + field_indices[field_name] = field_index + 1 + elif input_checked_index >= 0: + del token["data"][input_checked_index] + + elif field_type not in (u"button", u"submit", u"reset"): + if input_value_index >= 0: + token["data"][input_value_index] = (u"value", value) + else: + token["data"].append((u"value", value)) + field_indices[field_name] = field_index + 1 + + field_type = None + field_name = None + + elif name == "textarea": + field_type = "textarea" + field_name = 
dict((token["data"])[::-1])["name"] + + elif name == "select": + field_type = "select" + attributes = dict(token["data"][::-1]) + field_name = attributes.get("name") + is_select_multiple = "multiple" in attributes + is_selected_option_found = False + + elif field_type == "select" and field_name and name == "option": + option_selected_index = -1 + option_value = None + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == "selected": + option_selected_index = i + elif n == "value": + option_value = v.strip(spaceCharacters) + if option_value is None: + raise NotImplementedError("