mirror of
https://github.com/rembo10/headphones.git
synced 2026-05-02 09:49:36 +01:00
Update Beautiful Soup
This commit is contained in:
@@ -3,10 +3,14 @@ __license__ = "MIT"
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import re
|
||||
import warnings
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
RubyParenthesisString,
|
||||
RubyTextString,
|
||||
Stylesheet,
|
||||
Script,
|
||||
TemplateString,
|
||||
@@ -28,6 +32,12 @@ XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
class XMLParsedAsHTMLWarning(UserWarning):
|
||||
"""The warning issued when an HTML parser is used to parse
|
||||
XML that is not XHTML.
|
||||
"""
|
||||
MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
"""A way of looking up TreeBuilder subclasses by their name or by desired
|
||||
@@ -112,7 +122,7 @@ class TreeBuilder(object):
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
|
||||
|
||||
# Whitespace should be preserved inside these tags.
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||
@@ -319,7 +329,7 @@ class TreeBuilder(object):
|
||||
values = value
|
||||
attrs[attr] = values
|
||||
return attrs
|
||||
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events.
|
||||
|
||||
@@ -390,17 +400,25 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||
# you need to use it.
|
||||
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||
|
||||
# The HTML standard defines an unusual content model for these tags.
|
||||
# We represent this by using a string class other than NavigableString
|
||||
# inside these tags.
|
||||
# These HTML tags need special treatment so they can be
|
||||
# represented by a string class other than NavigableString.
|
||||
#
|
||||
# I made this list by going through the HTML spec
|
||||
# For some of these tags, it's because the HTML standard defines
|
||||
# an unusual content model for them. I made this list by going
|
||||
# through the HTML spec
|
||||
# (https://html.spec.whatwg.org/#metadata-content) and looking for
|
||||
# "metadata content" elements that can contain strings.
|
||||
#
|
||||
# The Ruby tags (<rt> and <rp>) are here despite being normal
|
||||
# "phrasing content" tags, because the content they contain is
|
||||
# qualitatively different from other text in the document, and it
|
||||
# can be useful to be able to distinguish it.
|
||||
#
|
||||
# TODO: Arguably <noscript> could go here but it seems
|
||||
# qualitatively different from the other tags.
|
||||
DEFAULT_STRING_CONTAINERS = {
|
||||
'rt' : RubyTextString,
|
||||
'rp' : RubyParenthesisString,
|
||||
'style': Stylesheet,
|
||||
'script': Script,
|
||||
'template': TemplateString,
|
||||
@@ -431,7 +449,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||
}
|
||||
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
"""Replace the declared encoding in a <meta> tag with a placeholder,
|
||||
to be substituted when the tag is output to a string.
|
||||
@@ -475,6 +493,104 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
class DetectsXMLParsedAsHTML(object):
|
||||
"""A mixin class for any class (a TreeBuilder, or some class used by a
|
||||
TreeBuilder) that's in a position to detect whether an XML
|
||||
document is being incorrectly parsed as HTML, and issue an
|
||||
appropriate warning.
|
||||
|
||||
This requires being able to observe an incoming processing
|
||||
instruction that might be an XML declaration, and also able to
|
||||
observe tags as they're opened. If you can't do that for a given
|
||||
TreeBuilder, there's a less reliable implementation based on
|
||||
examining the raw markup.
|
||||
"""
|
||||
|
||||
# Regular expression for seeing if markup has an <html> tag.
|
||||
LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
|
||||
LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
|
||||
|
||||
XML_PREFIX = '<?xml'
|
||||
XML_PREFIX_B = b'<?xml'
|
||||
|
||||
@classmethod
|
||||
def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
|
||||
"""Perform a check on some markup to see if it looks like XML
|
||||
that's not XHTML. If so, issue a warning.
|
||||
|
||||
This is much less reliable than doing the check while parsing,
|
||||
but some of the tree builders can't do that.
|
||||
|
||||
:param stacklevel: The stacklevel of the code calling this
|
||||
function.
|
||||
|
||||
:return: True if the markup looks like non-XHTML XML, False
|
||||
otherwise.
|
||||
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
prefix = cls.XML_PREFIX_B
|
||||
looks_like_html = cls.LOOKS_LIKE_HTML_B
|
||||
else:
|
||||
prefix = cls.XML_PREFIX
|
||||
looks_like_html = cls.LOOKS_LIKE_HTML
|
||||
|
||||
if (markup is not None
|
||||
and markup.startswith(prefix)
|
||||
and not looks_like_html.search(markup[:500])
|
||||
):
|
||||
cls._warn(stacklevel=stacklevel+2)
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _warn(cls, stacklevel=5):
|
||||
"""Issue a warning about XML being parsed as HTML."""
|
||||
warnings.warn(
|
||||
XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
|
||||
stacklevel=stacklevel
|
||||
)
|
||||
|
||||
def _initialize_xml_detector(self):
|
||||
"""Call this method before parsing a document."""
|
||||
self._first_processing_instruction = None
|
||||
self._root_tag = None
|
||||
|
||||
def _document_might_be_xml(self, processing_instruction):
|
||||
"""Call this method when encountering an XML declaration, or a
|
||||
"processing instruction" that might be an XML declaration.
|
||||
"""
|
||||
if (self._first_processing_instruction is not None
|
||||
or self._root_tag is not None):
|
||||
# The document has already started. Don't bother checking
|
||||
# anymore.
|
||||
return
|
||||
|
||||
self._first_processing_instruction = processing_instruction
|
||||
|
||||
# We won't know until we encounter the first tag whether or
|
||||
# not this is actually a problem.
|
||||
|
||||
def _root_tag_encountered(self, name):
|
||||
"""Call this when you encounter the document's root tag.
|
||||
|
||||
This is where we actually check whether an XML document is
|
||||
being incorrectly parsed as HTML, and issue the warning.
|
||||
"""
|
||||
if self._root_tag is not None:
|
||||
# This method was incorrectly called multiple times. Do
|
||||
# nothing.
|
||||
return
|
||||
|
||||
self._root_tag = name
|
||||
if (name != 'html' and self._first_processing_instruction is not None
|
||||
and self._first_processing_instruction.lower().startswith('xml ')):
|
||||
# We encountered an XML declaration and then a tag other
|
||||
# than 'html'. This is a reliable indicator that a
|
||||
# non-XHTML document is being parsed as XML.
|
||||
self._warn()
|
||||
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
this_module = sys.modules[__name__]
|
||||
|
||||
@@ -8,6 +8,7 @@ __all__ = [
|
||||
import warnings
|
||||
import re
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
@@ -69,13 +70,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
# ATM because the html5lib TreeBuilder doesn't use
|
||||
# UnicodeDammit.
|
||||
if exclude_encodings:
|
||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||
warnings.warn(
|
||||
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
|
||||
stacklevel=3
|
||||
)
|
||||
|
||||
# html5lib only parses HTML, so if it's given XML that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
|
||||
markup, stacklevel=3
|
||||
)
|
||||
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
def feed(self, markup):
|
||||
if self.soup.parse_only is not None:
|
||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||
warnings.warn(
|
||||
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
|
||||
stacklevel=4
|
||||
)
|
||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||
self.underlying_builder.parser = parser
|
||||
extra_kwargs = dict()
|
||||
@@ -242,10 +256,10 @@ class AttrList(object):
|
||||
def __setitem__(self, name, value):
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = self.element.cdata_list_attributes
|
||||
if (name in list_attr['*']
|
||||
list_attr = self.element.cdata_list_attributes or {}
|
||||
if (name in list_attr.get('*', [])
|
||||
or (self.element.name in list_attr
|
||||
and name in list_attr[self.element.name])):
|
||||
and name in list_attr.get(self.element.name, []))):
|
||||
# A node that is being cloned may have already undergone
|
||||
# this procedure.
|
||||
if not isinstance(value, list):
|
||||
|
||||
@@ -10,30 +10,9 @@ __all__ = [
|
||||
|
||||
from html.parser import HTMLParser
|
||||
|
||||
try:
|
||||
from html.parser import HTMLParseError
|
||||
except ImportError as e:
|
||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||
class HTMLParseError(Exception):
|
||||
pass
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
@@ -44,6 +23,8 @@ from bs4.element import (
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
ParserRejectedMarkup,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
@@ -52,7 +33,7 @@ from bs4.builder import (
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||
"""A subclass of the Python standard library's HTMLParser class, which
|
||||
listens for HTMLParser events and translates them into calls
|
||||
to Beautiful Soup's tree construction API.
|
||||
@@ -88,19 +69,24 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
def error(self, msg):
|
||||
"""In Python 3, HTMLParser subclasses must implement error(), although
|
||||
this requirement doesn't appear to be documented.
|
||||
self._initialize_xml_detector()
|
||||
|
||||
In Python 2, HTMLParser implements error() by raising an exception,
|
||||
which we don't want to do.
|
||||
def error(self, message):
|
||||
# NOTE: This method is required so long as Python 3.9 is
|
||||
# supported. The corresponding code is removed from HTMLParser
|
||||
# in 3.5, but not removed from ParserBase until 3.10.
|
||||
# https://github.com/python/cpython/issues/76025
|
||||
#
|
||||
# The original implementation turned the error into a warning,
|
||||
# but in every case I discovered, this made HTMLParser
|
||||
# immediately crash with an error message that was less
|
||||
# helpful than the warning. The new implementation makes it
|
||||
# more clear that html.parser just can't parse this
|
||||
# markup. The 3.10 implementation does the same, though it
|
||||
# raises AssertionError rather than calling a method. (We
|
||||
# catch this error and wrap it in a ParserRejectedMarkup.)
|
||||
raise ParserRejectedMarkup(message)
|
||||
|
||||
In any event, this method is called only on very strange
|
||||
markup and our best strategy is to pretend it didn't happen
|
||||
and keep going.
|
||||
"""
|
||||
warnings.warn(msg)
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
"""Handle an incoming empty-element tag.
|
||||
|
||||
@@ -167,6 +153,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
if self._root_tag is None:
|
||||
self._root_tag_encountered(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
"""Handle a closing tag, e.g. '</tag>'
|
||||
@@ -185,7 +174,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
self.already_closed_empty_element.remove(name)
|
||||
else:
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
|
||||
def handle_data(self, data):
|
||||
"""Handle some textual data that shows up between tags."""
|
||||
self.soup.handle_data(data)
|
||||
@@ -197,9 +186,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
:param name: Character number, possibly in hexadecimal.
|
||||
"""
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed in all supported versions.
|
||||
# http://bugs.python.org/issue13633
|
||||
# TODO: This was originally a workaround for a bug in
|
||||
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
|
||||
# been fixed, but removing this code still makes some
|
||||
# Beautiful Soup tests fail. This needs investigation.
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
@@ -288,6 +278,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
"""
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self._document_might_be_xml(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
@@ -326,10 +317,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
parser_args = parser_args or []
|
||||
parser_kwargs = parser_kwargs or {}
|
||||
parser_kwargs.update(extra_parser_kwargs)
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
parser_kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
parser_kwargs['convert_charrefs'] = False
|
||||
parser_kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (parser_args, parser_kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
@@ -391,102 +379,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
try:
|
||||
parser.feed(markup)
|
||||
parser.close()
|
||||
except HTMLParseError as e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
except AssertionError as e:
|
||||
# html.parser raises AssertionError in rare cases to
|
||||
# indicate a fatal problem with the markup, especially
|
||||
# when there's an error in the doctype declaration.
|
||||
raise ParserRejectedMarkup(e)
|
||||
parser.already_closed_empty_element = []
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
# string.
|
||||
#
|
||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
||||
import re
|
||||
attrfind_tolerant = re.compile(
|
||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
||||
|
||||
locatestarttagend = re.compile(r"""
|
||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||
(?:\s+ # whitespace before attribute name
|
||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
||||
(?:\s*=\s* # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|\"[^\"]*\" # LIT-enclosed value
|
||||
|[^'\">\s]+ # bare value
|
||||
)
|
||||
)?
|
||||
)
|
||||
)*
|
||||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
||||
|
||||
from html.parser import tagfind, attrfind
|
||||
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
return endpos
|
||||
rawdata = self.rawdata
|
||||
self.__starttag_text = rawdata[i:endpos]
|
||||
|
||||
# Now parse the data between i+1 and j into a tag and attrs
|
||||
attrs = []
|
||||
match = tagfind.match(rawdata, i+1)
|
||||
assert match, 'unexpected call to parse_starttag()'
|
||||
k = match.end()
|
||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||
while k < endpos:
|
||||
if self.strict:
|
||||
m = attrfind.match(rawdata, k)
|
||||
else:
|
||||
m = attrfind_tolerant.match(rawdata, k)
|
||||
if not m:
|
||||
break
|
||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = None
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = self.unescape(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
end = rawdata[k:endpos].strip()
|
||||
if end not in (">", "/>"):
|
||||
lineno, offset = self.getpos()
|
||||
if "\n" in self.__starttag_text:
|
||||
lineno = lineno + self.__starttag_text.count("\n")
|
||||
offset = len(self.__starttag_text) \
|
||||
- self.__starttag_text.rfind("\n")
|
||||
else:
|
||||
offset = offset + len(self.__starttag_text)
|
||||
if self.strict:
|
||||
self.error("junk characters in start tag: %r"
|
||||
% (rawdata[k:endpos][:20],))
|
||||
self.handle_data(rawdata[i:endpos])
|
||||
return endpos
|
||||
if end.endswith('/>'):
|
||||
# XHTML-style empty tag: <span attr="value" />
|
||||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
return endpos
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
||||
|
||||
CONSTRUCTOR_TAKES_STRICT = True
|
||||
|
||||
@@ -22,6 +22,7 @@ from bs4.element import (
|
||||
XMLProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
@@ -79,15 +80,24 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
This might be useful later on when creating CSS selectors.
|
||||
|
||||
This will track (almost) all namespaces, even ones that were
|
||||
only in scope for part of the document. If two namespaces have
|
||||
the same prefix, only the first one encountered will be
|
||||
tracked. Un-prefixed namespaces are not tracked.
|
||||
|
||||
:param mapping: A dictionary mapping namespace prefixes to URIs.
|
||||
"""
|
||||
for key, value in list(mapping.items()):
|
||||
# This is 'if key' and not 'if key is not None' because we
|
||||
# don't track un-prefixed namespaces. Soupselect will
|
||||
# treat an un-prefixed namespace as the default, which
|
||||
# causes confusion in some cases.
|
||||
if key and key not in self.soup._namespaces:
|
||||
# Let the BeautifulSoup object know about a new namespace.
|
||||
# If there are multiple namespaces defined with the same
|
||||
# prefix, the first one in the document takes precedence.
|
||||
self.soup._namespaces[key] = value
|
||||
|
||||
|
||||
def default_parser(self, encoding):
|
||||
"""Find the default parser for the given encoding.
|
||||
|
||||
@@ -125,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
|
||||
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
@@ -166,12 +177,23 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
is_html = not self.is_xml
|
||||
if is_html:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
# We're in HTML mode, so if we're given XML, that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
|
||||
markup, stacklevel=3
|
||||
)
|
||||
else:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
if isinstance(markup, str):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
|
||||
# TODO: This is a workaround for
|
||||
# https://bugs.launchpad.net/lxml/+bug/1948551.
|
||||
# We can remove it once the upstream issue is fixed.
|
||||
if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
|
||||
markup = markup[1:]
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, str):
|
||||
@@ -240,6 +262,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
# mappings.
|
||||
self.nsmaps.append(_invert(nsmap))
|
||||
|
||||
# The currently active namespace prefixes have
|
||||
# changed. Calculate the new mapping so it can be stored
|
||||
# with all Tag objects created while these prefixes are in
|
||||
# scope.
|
||||
current_mapping = dict(self.active_namespace_prefixes[-1])
|
||||
current_mapping.update(nsmap)
|
||||
|
||||
# We should not track un-prefixed namespaces as we can only hold one
|
||||
# and it will be recognized as the default namespace by soupsieve,
|
||||
# which may be confusing in some situations.
|
||||
if '' in current_mapping:
|
||||
del current_mapping['']
|
||||
self.active_namespace_prefixes.append(current_mapping)
|
||||
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
@@ -264,8 +300,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
self.soup.handle_starttag(
|
||||
name, namespace, nsprefix, attrs,
|
||||
namespaces=self.active_namespace_prefixes[-1]
|
||||
)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
@@ -289,13 +328,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
out_of_scope_nsmap = self.nsmaps.pop()
|
||||
|
||||
if out_of_scope_nsmap is not None:
|
||||
# This tag introduced a namespace mapping which is no
|
||||
# longer in scope. Recalculate the currently active
|
||||
# namespace prefixes.
|
||||
self.active_namespace_prefixes.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
data = target + ' ' + data
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user