mirror of
https://github.com/rembo10/headphones.git
synced 2026-03-21 12:19:27 +00:00
BeautifulSoup needs lxml or html5, have included html5lib. Also latest BeautifulSoup 4.1.3
53 lines
2.4 KiB
Python
53 lines
2.4 KiB
Python
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
|
|
|
|
# Cache of already-resolved TreeWalker classes, keyed by lower-cased tree type.
# The "etree" type is deliberately never stored here (see getTreeWalker).
treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "pulldom", "etree", "lxml",
               "beautifulsoup" and "genshi"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "pulldom" - The xml.dom.pulldom event stream
               "etree" - A generic walker for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "lxml" - Optimized walker for lxml.etree
               "beautifulsoup" - Beautiful soup (if installed)
               "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                     implementing the tree type e.g. xml.etree.ElementTree or
                     cElementTree.

    kwargs - extra keyword arguments; they are only used for the "etree" tree
             type, where they are forwarded to etree.getETreeModule.

    Returns the TreeWalker class for the requested tree type, or None when
    treeType is neither one of the names above nor already present in
    treeWalkerCache."""

    # Lookups are case-insensitive: the cache is keyed by the lower-cased name.
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            # For these types the walker module has the same name as the type.
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "lxml":
            import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    # Unknown tree types fall through to here and yield None rather than raising.
    return treeWalkerCache.get(treeType)
|