Include html5lib for BeautifulSoup

BeautifulSoup needs either lxml or html5lib as its parser backend, so html5lib is now included. Also updates BeautifulSoup to the latest release, 4.1.3.
2026-07-21 00:14:02 +01:00 · 2012-09-06 10:47:07 +12:00
parent 7aac60cecd
commit b8c5782765
136 changed files with 87265 additions and 2428 deletions
@@ -0,0 +1,180 @@
+import os
+import unittest
+from support import html5lib_test_files
+
+try:
+    import json
+except ImportError:
+    import simplejson as json
+
+import html5lib
+from html5lib import html5parser, serializer, constants
+from html5lib.treewalkers._base import TreeWalker
+
+# Names of optional third-party packages that imported successfully.
+# Test cases that need one of them check this list before being defined.
+optionals_loaded = []
+
+# lxml is optional: when it is missing the import error is ignored and
+# "lxml" is simply never added to optionals_loaded.
+try:
+    from lxml import etree
+    optionals_loaded.append("lxml")
+except ImportError:
+    pass
+
+# Namespace applied by JsonWalker to tag tokens that do not carry one.
+default_namespace = constants.namespaces["html"]
+
+class JsonWalker(TreeWalker):
+    def __iter__(self):
+        for token in self.tree:
+            type = token[0]
+            if type == "StartTag":
+                if len(token) == 4:
+                    namespace, name, attrib = token[1:4]
+                else:
+                    namespace = default_namespace
+                    name, attrib = token[1:3]
+                yield self.startTag(namespace, name, self._convertAttrib(attrib))
+            elif type == "EndTag":
+                if len(token) == 3:
+                    namespace, name = token[1:3]
+                else:
+                    namespace = default_namespace
+                    name = token[1]
+                yield self.endTag(namespace, name)
+            elif type == "EmptyTag":
+                if len(token) == 4:
+                    namespace, name, attrib = token[1:]
+                else:
+                    namespace = default_namespace
+                    name, attrib = token[1:]
+                for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
+                    yield token
+            elif type == "Comment":
+                yield self.comment(token[1])
+            elif type in ("Characters", "SpaceCharacters"):
+                for token in self.text(token[1]):
+                    yield token
+            elif type == "Doctype":
+                if len(token) == 4:
+                    yield self.doctype(token[1], token[2], token[3])
+                elif len(token) == 3:
+                    yield self.doctype(token[1], token[2])
+                else:
+                    yield self.doctype(token[1])
+            else:
+                raise ValueError("Unknown token type: " + type)
+    
+    def _convertAttrib(self, attribs):
+        """html5lib tree-walkers use a dict of (namespace, name): value for
+        attributes, but JSON cannot represent this. Convert from the format
+        in the serializer tests (a list of dicts with "namespace", "name",
+        and "value" as keys) to html5lib's tree-walker format."""
+        attrs = {}
+        for attrib in attribs:
+            name = (attrib["namespace"], attrib["name"])
+            assert(name not in attrs)
+            attrs[name] = attrib["value"]
+        return attrs
+
+
+def serialize_html(input, options):
+    options = dict([(str(k),v) for k,v in options.iteritems()])
+    return serializer.HTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
+
+def serialize_xhtml(input, options):
+    options = dict([(str(k),v) for k,v in options.iteritems()])
+    return serializer.XHTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
+
+def make_test(input, expected, xhtml, options):
+    result = serialize_html(input, options)
+    if len(expected) == 1:
+        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], result, str(options))
+    elif result not in expected:
+        assert False, "Expected: %s, Received: %s" % (expected, result)
+
+    if not xhtml:
+        return
+
+    result = serialize_xhtml(input, options)
+    if len(xhtml) == 1:
+        assert xhtml[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:True\n%s"%(xhtml[0], result, str(options))
+    elif result not in xhtml:
+        assert False, "Expected: %s, Received: %s" % (xhtml, result)
+
+
+class EncodingTestCase(unittest.TestCase):
+    def throwsWithLatin1(self, input):
+        self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})
+
+    def testDoctypeName(self):
+        self.throwsWithLatin1([["Doctype", u"\u0101"]])
+
+    def testDoctypePublicId(self):
+        self.throwsWithLatin1([["Doctype", u"potato", u"\u0101"]])
+
+    def testDoctypeSystemId(self):
+        self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])
+
+    def testCdataCharacters(self):
+        self.assertEquals("<style>&amacr;", serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
+                                                            ["Characters", u"\u0101"]],
+                                                           {"encoding": "iso-8859-1"}))
+
+    def testCharacters(self):
+        self.assertEquals("&amacr;", serialize_html([["Characters", u"\u0101"]],
+                                                    {"encoding": "iso-8859-1"}))
+
+    def testStartTagName(self):
+        self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
+
+    def testEmptyTagName(self):
+        self.throwsWithLatin1([["EmptyTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])
+
+    def testAttributeName(self):
+        self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"span", [{"namespace": None, "name": u"\u0101", "value": u"potato"}]]])
+
+    def testAttributeValue(self):
+        self.assertEquals("<span potato=&amacr;>", serialize_html([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
+                                                                    [{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
+                                                                  {"encoding": "iso-8859-1"}))
+
+    def testEndTagName(self):
+        self.throwsWithLatin1([["EndTag", u"http://www.w3.org/1999/xhtml", u"\u0101"]])
+
+    def testComment(self):
+        self.throwsWithLatin1([["Comment", u"\u0101"]])
+
+
+if "lxml" in optionals_loaded:
+    class LxmlTestCase(unittest.TestCase):
+        def setUp(self):
+            self.parser = etree.XMLParser(resolve_entities=False)
+            self.treewalker = html5lib.getTreeWalker("lxml")
+            self.serializer = serializer.HTMLSerializer()
+
+        def testEntityReplacement(self):
+            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
+            tree = etree.fromstring(doc, parser = self.parser).getroottree()
+            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
+            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)
+
+        def testEntityXML(self):
+            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
+            tree = etree.fromstring(doc, parser = self.parser).getroottree()
+            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
+            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)
+
+        def testEntityNoResolve(self):
+            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
+            tree = etree.fromstring(doc, parser = self.parser).getroottree()
+            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
+                                          resolve_entities=False)
+            self.assertEquals(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
+
+def test_serializer():
+    for filename in html5lib_test_files('serializer', '*.test'):
+        tests = json.load(file(filename))
+        test_name = os.path.basename(filename).replace('.test','')
+        for index, test in enumerate(tests['tests']):
+            xhtml = test.get("xhtml", test["expected"])
+            if test_name == 'optionaltags': 
+                xhtml = None
+            yield make_test, test["input"], test["expected"], xhtml, test.get("options", {})