mirror of
https://github.com/rembo10/headphones.git
synced 2026-03-21 20:29:27 +00:00
- Delete from Releases when deleting artist/album - Searcher - Size limits not quite working - Searcher - 1st newznab used even if disabled - Rutracker search stopped working for me, fixed by updating Beautiful Soup. Moved bs4 and html5lib to lib and ensured (I think) it’s imported from the right place
321 lines
13 KiB
Python
321 lines
13 KiB
Python
from __future__ import absolute_import, division, unicode_literals
|
|
from six import text_type
|
|
|
|
import gettext
|
|
_ = gettext.gettext
|
|
|
|
try:
|
|
from functools import reduce
|
|
except ImportError:
|
|
pass
|
|
|
|
from ..constants import voidElements, booleanAttributes, spaceCharacters
|
|
from ..constants import rcdataElements, entities, xmlEntities
|
|
from .. import utils
|
|
from xml.sax.saxutils import escape
|
|
|
|
spaceCharacters = "".join(spaceCharacters)
|
|
|
|
try:
|
|
from codecs import register_error, xmlcharrefreplace_errors
|
|
except ImportError:
|
|
unicode_encode_errors = "strict"
|
|
else:
|
|
unicode_encode_errors = "htmlentityreplace"
|
|
|
|
encode_entity_map = {}
|
|
is_ucs4 = len("\U0010FFFF") == 1
|
|
for k, v in list(entities.items()):
|
|
# skip multi-character entities
|
|
if ((is_ucs4 and len(v) > 1) or
|
|
(not is_ucs4 and len(v) > 2)):
|
|
continue
|
|
if v != "&":
|
|
if len(v) == 2:
|
|
v = utils.surrogatePairToCodepoint(v)
|
|
else:
|
|
v = ord(v)
|
|
if not v in encode_entity_map or k.islower():
|
|
# prefer < over < and similarly for &, >, etc.
|
|
encode_entity_map[v] = k
|
|
|
|
def htmlentityreplace_errors(exc):
|
|
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
|
res = []
|
|
codepoints = []
|
|
skip = False
|
|
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
|
if skip:
|
|
skip = False
|
|
continue
|
|
index = i + exc.start
|
|
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
|
|
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
|
|
skip = True
|
|
else:
|
|
codepoint = ord(c)
|
|
codepoints.append(codepoint)
|
|
for cp in codepoints:
|
|
e = encode_entity_map.get(cp)
|
|
if e:
|
|
res.append("&")
|
|
res.append(e)
|
|
if not e.endswith(";"):
|
|
res.append(";")
|
|
else:
|
|
res.append("&#x%s;" % (hex(cp)[2:]))
|
|
return ("".join(res), exc.end)
|
|
else:
|
|
return xmlcharrefreplace_errors(exc)
|
|
|
|
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
|
|
|
del register_error
|
|
|
|
|
|
class HTMLSerializer(object):
|
|
|
|
# attribute quoting options
|
|
quote_attr_values = False
|
|
quote_char = '"'
|
|
use_best_quote_char = True
|
|
|
|
# tag syntax options
|
|
omit_optional_tags = True
|
|
minimize_boolean_attributes = True
|
|
use_trailing_solidus = False
|
|
space_before_trailing_solidus = True
|
|
|
|
# escaping options
|
|
escape_lt_in_attrs = False
|
|
escape_rcdata = False
|
|
resolve_entities = True
|
|
|
|
# miscellaneous options
|
|
alphabetical_attributes = False
|
|
inject_meta_charset = True
|
|
strip_whitespace = False
|
|
sanitize = False
|
|
|
|
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
|
"omit_optional_tags", "minimize_boolean_attributes",
|
|
"use_trailing_solidus", "space_before_trailing_solidus",
|
|
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
|
|
"alphabetical_attributes", "inject_meta_charset",
|
|
"strip_whitespace", "sanitize")
|
|
|
|
def __init__(self, **kwargs):
|
|
"""Initialize HTMLSerializer.
|
|
|
|
Keyword options (default given first unless specified) include:
|
|
|
|
inject_meta_charset=True|False
|
|
Whether it insert a meta element to define the character set of the
|
|
document.
|
|
quote_attr_values=True|False
|
|
Whether to quote attribute values that don't require quoting
|
|
per HTML5 parsing rules.
|
|
quote_char=u'"'|u"'"
|
|
Use given quote character for attribute quoting. Default is to
|
|
use double quote unless attribute value contains a double quote,
|
|
in which case single quotes are used instead.
|
|
escape_lt_in_attrs=False|True
|
|
Whether to escape < in attribute values.
|
|
escape_rcdata=False|True
|
|
Whether to escape characters that need to be escaped within normal
|
|
elements within rcdata elements such as style.
|
|
resolve_entities=True|False
|
|
Whether to resolve named character entities that appear in the
|
|
source tree. The XML predefined entities < > & " '
|
|
are unaffected by this setting.
|
|
strip_whitespace=False|True
|
|
Whether to remove semantically meaningless whitespace. (This
|
|
compresses all whitespace to a single space except within pre.)
|
|
minimize_boolean_attributes=True|False
|
|
Shortens boolean attributes to give just the attribute value,
|
|
for example <input disabled="disabled"> becomes <input disabled>.
|
|
use_trailing_solidus=False|True
|
|
Includes a close-tag slash at the end of the start tag of void
|
|
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
|
|
space_before_trailing_solidus=True|False
|
|
Places a space immediately before the closing slash in a tag
|
|
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
|
|
sanitize=False|True
|
|
Strip all unsafe or unknown constructs from output.
|
|
See `html5lib user documentation`_
|
|
omit_optional_tags=True|False
|
|
Omit start/end tags that are optional.
|
|
alphabetical_attributes=False|True
|
|
Reorder attributes to be in alphabetical order.
|
|
|
|
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
|
"""
|
|
if 'quote_char' in kwargs:
|
|
self.use_best_quote_char = False
|
|
for attr in self.options:
|
|
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
|
self.errors = []
|
|
self.strict = False
|
|
|
|
def encode(self, string):
|
|
assert(isinstance(string, text_type))
|
|
if self.encoding:
|
|
return string.encode(self.encoding, unicode_encode_errors)
|
|
else:
|
|
return string
|
|
|
|
def encodeStrict(self, string):
|
|
assert(isinstance(string, text_type))
|
|
if self.encoding:
|
|
return string.encode(self.encoding, "strict")
|
|
else:
|
|
return string
|
|
|
|
def serialize(self, treewalker, encoding=None):
|
|
self.encoding = encoding
|
|
in_cdata = False
|
|
self.errors = []
|
|
|
|
if encoding and self.inject_meta_charset:
|
|
from ..filters.inject_meta_charset import Filter
|
|
treewalker = Filter(treewalker, encoding)
|
|
# WhitespaceFilter should be used before OptionalTagFilter
|
|
# for maximum efficiently of this latter filter
|
|
if self.strip_whitespace:
|
|
from ..filters.whitespace import Filter
|
|
treewalker = Filter(treewalker)
|
|
if self.sanitize:
|
|
from ..filters.sanitizer import Filter
|
|
treewalker = Filter(treewalker)
|
|
if self.omit_optional_tags:
|
|
from ..filters.optionaltags import Filter
|
|
treewalker = Filter(treewalker)
|
|
# Alphabetical attributes must be last, as other filters
|
|
# could add attributes and alter the order
|
|
if self.alphabetical_attributes:
|
|
from ..filters.alphabeticalattributes import Filter
|
|
treewalker = Filter(treewalker)
|
|
|
|
for token in treewalker:
|
|
type = token["type"]
|
|
if type == "Doctype":
|
|
doctype = "<!DOCTYPE %s" % token["name"]
|
|
|
|
if token["publicId"]:
|
|
doctype += ' PUBLIC "%s"' % token["publicId"]
|
|
elif token["systemId"]:
|
|
doctype += " SYSTEM"
|
|
if token["systemId"]:
|
|
if token["systemId"].find('"') >= 0:
|
|
if token["systemId"].find("'") >= 0:
|
|
self.serializeError(_("System identifer contains both single and double quote characters"))
|
|
quote_char = "'"
|
|
else:
|
|
quote_char = '"'
|
|
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
|
|
|
|
doctype += ">"
|
|
yield self.encodeStrict(doctype)
|
|
|
|
elif type in ("Characters", "SpaceCharacters"):
|
|
if type == "SpaceCharacters" or in_cdata:
|
|
if in_cdata and token["data"].find("</") >= 0:
|
|
self.serializeError(_("Unexpected </ in CDATA"))
|
|
yield self.encode(token["data"])
|
|
else:
|
|
yield self.encode(escape(token["data"]))
|
|
|
|
elif type in ("StartTag", "EmptyTag"):
|
|
name = token["name"]
|
|
yield self.encodeStrict("<%s" % name)
|
|
if name in rcdataElements and not self.escape_rcdata:
|
|
in_cdata = True
|
|
elif in_cdata:
|
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
|
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
|
# TODO: Add namespace support here
|
|
k = attr_name
|
|
v = attr_value
|
|
yield self.encodeStrict(' ')
|
|
|
|
yield self.encodeStrict(k)
|
|
if not self.minimize_boolean_attributes or \
|
|
(k not in booleanAttributes.get(name, tuple())
|
|
and k not in booleanAttributes.get("", tuple())):
|
|
yield self.encodeStrict("=")
|
|
if self.quote_attr_values or not v:
|
|
quote_attr = True
|
|
else:
|
|
quote_attr = reduce(lambda x, y: x or (y in v),
|
|
spaceCharacters + ">\"'=", False)
|
|
v = v.replace("&", "&")
|
|
if self.escape_lt_in_attrs:
|
|
v = v.replace("<", "<")
|
|
if quote_attr:
|
|
quote_char = self.quote_char
|
|
if self.use_best_quote_char:
|
|
if "'" in v and '"' not in v:
|
|
quote_char = '"'
|
|
elif '"' in v and "'" not in v:
|
|
quote_char = "'"
|
|
if quote_char == "'":
|
|
v = v.replace("'", "'")
|
|
else:
|
|
v = v.replace('"', """)
|
|
yield self.encodeStrict(quote_char)
|
|
yield self.encode(v)
|
|
yield self.encodeStrict(quote_char)
|
|
else:
|
|
yield self.encode(v)
|
|
if name in voidElements and self.use_trailing_solidus:
|
|
if self.space_before_trailing_solidus:
|
|
yield self.encodeStrict(" /")
|
|
else:
|
|
yield self.encodeStrict("/")
|
|
yield self.encode(">")
|
|
|
|
elif type == "EndTag":
|
|
name = token["name"]
|
|
if name in rcdataElements:
|
|
in_cdata = False
|
|
elif in_cdata:
|
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
|
yield self.encodeStrict("</%s>" % name)
|
|
|
|
elif type == "Comment":
|
|
data = token["data"]
|
|
if data.find("--") >= 0:
|
|
self.serializeError(_("Comment contains --"))
|
|
yield self.encodeStrict("<!--%s-->" % token["data"])
|
|
|
|
elif type == "Entity":
|
|
name = token["name"]
|
|
key = name + ";"
|
|
if not key in entities:
|
|
self.serializeError(_("Entity %s not recognized" % name))
|
|
if self.resolve_entities and key not in xmlEntities:
|
|
data = entities[key]
|
|
else:
|
|
data = "&%s;" % name
|
|
yield self.encodeStrict(data)
|
|
|
|
else:
|
|
self.serializeError(token["data"])
|
|
|
|
def render(self, treewalker, encoding=None):
|
|
if encoding:
|
|
return b"".join(list(self.serialize(treewalker, encoding)))
|
|
else:
|
|
return "".join(list(self.serialize(treewalker)))
|
|
|
|
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
|
# XXX The idea is to make data mandatory.
|
|
self.errors.append(data)
|
|
if self.strict:
|
|
raise SerializeError
|
|
|
|
|
|
def SerializeError(Exception):
|
|
"""Error in serialized tree"""
|
|
pass
|