Revert "Migrate as much as possible to pip and requirements.txt"

This reverts commit 982594a4a5.
2026-05-05 11:19:28 +01:00 · 2016-01-29 15:38:12 +00:00
parent 3c015990c5
commit 2110eb9855
1195 changed files with 196887 additions and 81 deletions
--- a/lib/html5lib/treewalkers/init.py
+++ b/lib/html5lib/treewalkers/init.py
@@ -0,0 +1,147 @@
+"""A collection of modules for iterating through different kinds of
+tree, generating tokens identical to those produced by the tokenizer
+module.
+
+To create a tree walker for a new type of tree, you need to do
+implement a tree walker object (called TreeWalker by convention) that
+implements a 'serialize' method taking a tree as sole argument and
+returning an iterator generating tokens.
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
+           "pulldom"]
+
+import sys
+
+from .. import constants
+from ..utils import default_etree
+
+treeWalkerCache = {}
+
+
+def getTreeWalker(treeType, implementation=None, **kwargs):
+    """Get a TreeWalker class for various types of tree with built-in support
+
+    treeType - the name of the tree type required (case-insensitive). Supported
+               values are:
+
+                "dom" - The xml.dom.minidom DOM implementation
+                "pulldom" - The xml.dom.pulldom event stream
+                "etree" - A generic walker for tree implementations exposing an
+                          elementtree-like interface (known to work with
+                          ElementTree, cElementTree and lxml.etree).
+                "lxml" - Optimized walker for lxml.etree
+                "genshi" - a Genshi stream
+
+    implementation - (Currently applies to the "etree" tree type only). A module
+                      implementing the tree type e.g. xml.etree.ElementTree or
+                      cElementTree."""
+
+    treeType = treeType.lower()
+    if treeType not in treeWalkerCache:
+        if treeType in ("dom", "pulldom"):
+            name = "%s.%s" % (__name__, treeType)
+            __import__(name)
+            mod = sys.modules[name]
+            treeWalkerCache[treeType] = mod.TreeWalker
+        elif treeType == "genshi":
+            from . import genshistream
+            treeWalkerCache[treeType] = genshistream.TreeWalker
+        elif treeType == "lxml":
+            from . import lxmletree
+            treeWalkerCache[treeType] = lxmletree.TreeWalker
+        elif treeType == "etree":
+            from . import etree
+            if implementation is None:
+                implementation = default_etree
+            # XXX: NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeWalker
+    return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    pendingCharacters = []
+    for token in tokens:
+        type = token["type"]
+        if type in ("Characters", "SpaceCharacters"):
+            pendingCharacters.append(token["data"])
+        else:
+            if pendingCharacters:
+                yield {"type": "Characters", "data": "".join(pendingCharacters)}
+                pendingCharacters = []
+            yield token
+    if pendingCharacters:
+        yield {"type": "Characters", "data": "".join(pendingCharacters)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers"""
+    output = []
+    indent = 0
+    for token in concatenateCharacterTokens(walker):
+        type = token["type"]
+        if type in ("StartTag", "EmptyTag"):
+            # tag name
+            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
+                if token["namespace"] in constants.prefixes:
+                    ns = constants.prefixes[token["namespace"]]
+                else:
+                    ns = token["namespace"]
+                name = "%s %s" % (ns, token["name"])
+            else:
+                name = token["name"]
+            output.append("%s<%s>" % (" " * indent, name))
+            indent += 2
+            # attributes (sorted for consistent ordering)
+            attrs = token["data"]
+            for (namespace, localname), value in sorted(attrs.items()):
+                if namespace:
+                    if namespace in constants.prefixes:
+                        ns = constants.prefixes[namespace]
+                    else:
+                        ns = namespace
+                    name = "%s %s" % (ns, localname)
+                else:
+                    name = localname
+                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
+            # self-closing
+            if type == "EmptyTag":
+                indent -= 2
+
+        elif type == "EndTag":
+            indent -= 2
+
+        elif type == "Comment":
+            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
+
+        elif type == "Doctype":
+            if token["name"]:
+                if token["publicId"]:
+                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["publicId"],
+                                   token["systemId"] if token["systemId"] else ""))
+                elif token["systemId"]:
+                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["systemId"]))
+                else:
+                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
+                                                       token["name"]))
+            else:
+                output.append("%s<!DOCTYPE >" % (" " * indent,))
+
+        elif type == "Characters":
+            output.append("%s\"%s\"" % (" " * indent, token["data"]))
+
+        elif type == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % type)
+
+    return "\n".join(output)