mirror of
https://github.com/rembo10/headphones.git
synced 2026-03-21 04:09:26 +00:00
- Delete from Releases when deleting artist/album - Searcher - Size limits not quite working - Searcher - 1st newznab used even if disabled - Rutracker search stopped working for me, fixed by updating Beautiful Soup. Moved bs4 and html5lib to lib and ensured (I think) it’s imported from the right place
887 lines
30 KiB
Python
887 lines
30 KiB
Python
from __future__ import absolute_import, division, unicode_literals
|
|
from six import text_type
|
|
from six.moves import http_client
|
|
|
|
import codecs
|
|
import re
|
|
|
|
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
|
from .constants import encodings, ReparseException
|
|
from . import utils
|
|
|
|
from io import StringIO
|
|
|
|
try:
|
|
from io import BytesIO
|
|
except ImportError:
|
|
BytesIO = StringIO
|
|
|
|
try:
|
|
from io import BufferedIOBase
|
|
except ImportError:
|
|
class BufferedIOBase(object):
|
|
pass
|
|
|
|
# Byte-string counterparts of the unicode constants, used by the encoding
# pre-parser, which works on raw (undecoded) input.
spaceCharactersBytes = frozenset(item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(item.encode("ascii") for item in asciiUppercase)
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

# Matches every code point that is reported as an "invalid-codepoint" error
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

# The last two code points (xFFFE and xFFFF) of each of the planes 1..16
non_bmp_invalid_codepoints = set((plane << 16) | low
                                 for plane in range(1, 17)
                                 for low in (0xFFFE, 0xFFFF))

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache of compiled regexps for charsUntil(), keyed by (characters, opposite)
charsUntilRegEx = {}
|
|
|
|
|
|
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    Everything read from the wrapped stream is remembered, so that it is
    possible to seek() back into data that has already been consumed.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        """Wrap `stream`, an object with a read(n) method returning bytes."""
        self.stream = stream
        self.buffer = []
        # [chunk number, offset inside that chunk]; the chunk number is -1
        # until the first chunk has been read from the underlying stream
        self.position = [-1, 0]

    def tell(self):
        """Return the current position as a byte count from the start."""
        chunkIndex, offset = self.position
        if chunkIndex < 0:
            # Nothing has been read yet
            return offset
        return sum(len(chunk) for chunk in self.buffer[:chunkIndex]) + offset

    def seek(self, pos):
        """Move to absolute position `pos`, which must already be buffered."""
        assert pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing buffered yet, so the only reachable position is the
            # start; indexing self.buffer[0] below would raise IndexError.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to `bytes` bytes, from the buffer first, then the stream."""
        if not self.buffer:
            return self._readStream(bytes)
        # The position is at the very end of the buffered data when it points
        # just past the last byte of the *last* chunk (index len - 1).
        atEnd = (self.position[0] == len(self.buffer) - 1 and
                 self.position[1] == len(self.buffer[-1]))
        if atEnd:
            return self._readStream(bytes)
        return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total number of bytes held in the buffer."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the underlying stream and record the data as a new chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request can be completed from this chunk
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Use up the rest of this chunk and move on to the next one
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
|
|
|
|
|
|
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Build the input stream class that matches the type of `source`.

    Unicode sources (a text string, or a file-like object whose read()
    returns text) give an HTMLUnicodeInputStream; anything else gives an
    HTMLBinaryInputStream, to which `encoding`, `parseMeta` and `chardet`
    are passed on.

    Raises TypeError when `encoding` is given together with a unicode source.
    """
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        isUnicode = False
    elif hasattr(source, "read"):
        # A zero-length read is enough to learn the type the stream returns
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if not isUnicode:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

    if encoding is not None:
        raise TypeError("Cannot explicitly set an encoding with a unicode string")

    return HTMLUnicodeInputStream(source)
|
|
|
|
|
|
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of replacing invalid characters, normalising
    newlines and also provides column and line tracking.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object (anything with a read() method)
        or a unicode string.

        """

        # Craziness: pick the error reporter and the replacement regexp that
        # match how this Python build stores non-BMP characters
        if len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        """Discard the current chunk, the errors and the position tracking."""
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object or a string; a string is wrapped
        in a StringIO.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        """Return the zero-based (line, column) of `offset` in the chunk."""
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line that was started in an earlier chunk
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Load the next chunk of input into self.chunk.

        Returns False when no data is left, True otherwise.
        """
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                # Hold back a trailing CR or high surrogate: its meaning
                # depends on the first character of the next chunk
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record one "invalid-codepoint" error per invalid character."""
        numErrors = len(invalid_unicode_re.findall(data))
        self.errors.extend(["invalid-codepoint"] * numErrors)

    def characterErrorsUCS2(self, data):
        """Record "invalid-codepoint" errors when surrogates are code units."""
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # This match is the low half of the surrogate pair handled in
                # the previous iteration. Clear the flag so that later
                # matches are examined again.
                skip = False
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.

        If 'opposite' is true the sense is inverted: the returned string is
        made up only of characters that are in 'characters'.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        """Push `char` back so that it is the next character read.

        A `char` of None is ignored.
        """
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
|
|
|
|
|
|
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object (anything with a read() method)
        or a byte string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Guess the encoding with charade/chardet (when one of them
        can be imported) if neither a BOM nor a <meta> element gave one

        """
        # Raw Stream - the undecoded bytes, wrapped so that it can be rewound
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Rebuild the decoding stream now that the encoding is known
        self.reset()

    def reset(self):
        """Recreate the decoder for the current encoding and reset all state."""
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object or a byte string. The result
        supports seek() on data that has been read: streams that cannot seek
        are wrapped in a BufferedStream.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            # Any failure here means the stream cannot be rewound on its own
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream.

        Returns an (encoding, confidence) tuple where confidence is either
        "certain" or "tentative".
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to `newEncoding` and request a reparse of the document.

        Only valid while the current encoding is not "certain". Unknown
        encodings are ignored; if the name matches the current encoding it
        is just marked as certain.

        Raises ReparseException when the encoding really changed; by then the
        stream has been rewound and decodes with the new encoding.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # The encoding has to be updated *before* reset(), because
            # reset() builds the decoder from self.charEncoding
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read in the first 4 bytes, enough for the longest BOM
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek if encoding else 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Only the first numBytesMeta bytes are examined, and the raw stream is
        rewound to its start afterwards.
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
|
|
|
|
|
|
class EncodingBytes(bytes):
    """Lower-cased byte string that also carries a current position.

    The position starts just before the first byte and is moved by iterating
    and by the skip/match/jump helpers. Reading the position once it has run
    past the end of the data raises StopIteration."""

    def __new__(cls, value):
        assert isinstance(value, bytes)
        return bytes.__new__(cls, value.lower())

    def __init__(self, value):
        # -1 means "before the first byte"
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        """Advance by one byte and return it as a length-1 byte string."""
        self._position += 1
        current = self._position
        if current >= len(self):
            raise StopIteration
        if current < 0:
            raise TypeError
        return self[current:current + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Step back by one byte and return the byte at the new position."""
        current = self._position
        if current >= len(self):
            raise StopIteration
        if current < 0:
            raise TypeError
        current -= 1
        self._position = current
        return self[current:current + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        # None stands for "iteration has not started yet"
        return self._position if self._position >= 0 else None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        start = self.position
        return self[start:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters

        Returns the first byte that is not in `chars` (the position is left
        on it), or None when the end of the data is reached."""
        index = self.position  # use property for the error-checking
        while index < len(self):
            byte = self[index:index + 1]
            if byte not in chars:
                self._position = index
                return byte
            index += 1
        self._position = index
        return None

    def skipUntil(self, chars):
        """Skip forward to the first byte that is in `chars`.

        Returns that byte (the position is left on it), or None when the end
        of the data is reached."""
        index = self.position
        while index < len(self):
            byte = self[index:index + 1]
            if byte in chars:
                self._position = index
                return byte
            index += 1
        self._position = index
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        start = self.position
        matched = self[start:start + len(bytes)].startswith(bytes)
        if matched:
            self.position += len(bytes)
        return matched

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match

        Raises StopIteration when the sequence does not occur."""
        found = self[self.position:].find(bytes)
        if found <= -1:
            raise StopIteration
        # XXX: This is ugly, but I can't see a nicer way to fix this.
        if self._position == -1:
            self._position = 0
        self._position += (found + len(bytes) - 1)
        return True
|
|
|
|
|
|
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """data - the byte string to search for an encoding declaration"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        """Scan the data and return the declared codec name, or None."""
        # Order matters: longer and more specific prefixes are tried first
        dispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            for prefix, handler in dispatch:
                if not self.data.matchBytes(prefix):
                    continue
                try:
                    keepParsing = handler()
                except StopIteration:
                    # The handler ran off the end of the data
                    keepParsing = False
                break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        """Look through the attributes of a meta element for an encoding.

        Returns False once self.encoding has been set, True to keep parsing.
        """
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            name, value = attr
            if name == b"http-equiv":
                hasPragma = value == b"content-type"
                if hasPragma and pendingEncoding is not None:
                    self.encoding = pendingEncoding
                    return False
            elif name == b"charset":
                codec = codecName(value)
                if codec is not None:
                    self.encoding = codec
                    return False
            elif name == b"content":
                tentativeEncoding = ContentAttrParser(EncodingBytes(value)).parse()
                if tentativeEncoding is None:
                    continue
                codec = codecName(tentativeEncoding)
                if codec is None:
                    continue
                if hasPragma:
                    self.encoding = codec
                    return False
                # Remember it in case the http-equiv attribute comes later
                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        """Skip over a start or end tag together with all of its attributes."""
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        stopByte = data.skipUntil(spacesAngleBrackets)
        if stopByte == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            while self.getAttribute() is not None:
                pass
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        nameParts = []
        valueParts = []

        def result():
            # Both parts are joined at the time of the call, so an empty
            # value list yields b"" for the value
            return b"".join(nameParts), b"".join(valueParts)

        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3 is the creation of the two (empty) part lists above
        # Step 4 attribute name
        while True:
            if c == b"=" and nameParts:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return result()
            elif c in asciiUppercaseBytes:
                nameParts.append(c.lower())
            elif c is None:
                return None
            else:
                nameParts.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return result()
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return result()
                # 10.4
                elif c in asciiUppercaseBytes:
                    valueParts.append(c.lower())
                # 10.5
                else:
                    valueParts.append(c)
        elif c == b">":
            return result()
        elif c in asciiUppercaseBytes:
            valueParts.append(c.lower())
        elif c is None:
            return None
        else:
            valueParts.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return result()
            elif c in asciiUppercaseBytes:
                valueParts.append(c.lower())
            elif c is None:
                return None
            else:
                valueParts.append(c)
|
|
|
|
|
|
class ContentAttrParser(object):
    """Extracts the charset value from the text of a content attribute"""

    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        """Return the raw bytes given after "charset=", or None if absent."""
        data = self.data
        try:
            # Check if the attr name is charset
            # otherwise return
            data.jumpTo(b"charset")
            data.position += 1
            data.skip()
            if data.currentByte != b"=":
                # If there is no = sign keep looking for attrs
                return None
            data.position += 1
            data.skip()
            # Look for an encoding between matching quote marks
            if data.currentByte in (b'"', b"'"):
                quoteMark = data.currentByte
                data.position += 1
                start = data.position
                if data.jumpTo(quoteMark):
                    return data[start:data.position]
                else:
                    return None
            # Unquoted value
            start = data.position
            try:
                data.skipUntil(spaceCharactersBytes)
                return data[start:data.position]
            except StopIteration:
                # Return the whole remaining value
                return data[start:]
        except StopIteration:
            return None
|
|
|
|
|
|
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            # A label that is not pure ASCII cannot name a known encoding
            return None
    if not encoding:
        return None
    # Labels are compared with ASCII punctuation and whitespace removed
    canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(canonicalName, None)
|