mirror of
https://github.com/rembo10/headphones.git
synced 2026-03-21 12:19:27 +00:00
BeautifulSoup needs lxml or html5lib as a parser backend, so html5lib has been included. Also updated to the latest BeautifulSoup, 4.1.3.
42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
# Compatibility shim: make sure the name ``frozenset`` is bound.
try:
    frozenset
except NameError:
    # No builtin frozenset here (the Python 2.3 case), so borrow the
    # immutable set type from the old ``sets`` module under the same name.
    from sets import ImmutableSet as frozenset
|
|
|
|
import re
|
|
|
|
import _base
|
|
from html5lib.constants import rcdataElements, spaceCharacters
|
|
# Flatten the whitespace characters imported from html5lib.constants into a
# single unicode string so they can be dropped into a regex character class.
spaceCharacters = u"".join(spaceCharacters)

# Matches one or more consecutive characters drawn from spaceCharacters.
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
|
|
|
|
class Filter(_base.Filter):
    """Token filter that collapses whitespace outside space-preserving elements.

    Tokens are passed through in their original order; only the ``"data"``
    entry of ``SpaceCharacters`` and ``Characters`` tokens is rewritten, and
    only while no element listed in ``spacePreserveElements`` is open.
    """

    # Elements whose text content must be left untouched.
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        """Yield every token from the wrapped source, normalising whitespace."""
        # Number of currently open elements at or below a space-preserving
        # element; zero means whitespace may be collapsed.
        depth = 0
        for token in _base.Filter.__iter__(self):
            kind = token["type"]

            if kind == "StartTag" and (
                    depth or token["name"] in self.spacePreserveElements):
                # Entering a preserving element, or nesting further inside one.
                depth += 1
            elif kind == "EndTag" and depth:
                depth -= 1
            elif not depth:
                if kind == "SpaceCharacters":
                    # Only replace non-empty data, so that no space is
                    # introduced where there was none.
                    if token["data"]:
                        token["data"] = u" "
                elif kind == "Characters":
                    token["data"] = collapse_spaces(token["data"])

            yield token
|
|
|
|
def collapse_spaces(text):
    """Return *text* with each match of SPACES_REGEX replaced by one space."""
    collapsed = SPACES_REGEX.sub(' ', text)
    return collapsed
|
|
|