mirror of
https://github.com/rembo10/headphones.git
synced 2026-05-02 17:59:28 +01:00
Include html5lib for BeautifulSoup
BeautifulSoup needs either lxml or html5lib as its parser, so html5lib is included here. Also includes the latest BeautifulSoup, 4.1.3.
This commit is contained in:
193
html5lib/tests/test_tokenizer.py
Normal file
193
html5lib/tests/test_tokenizer.py
Normal file
@@ -0,0 +1,193 @@
|
||||
import sys
|
||||
import os
|
||||
import unittest
|
||||
import cStringIO
|
||||
import warnings
|
||||
import re
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
import simplejson as json
|
||||
|
||||
from support import html5lib_test_files
|
||||
from html5lib.tokenizer import HTMLTokenizer
|
||||
from html5lib import constants
|
||||
|
||||
class TokenizerTestParser(object):
    """Run an ``HTMLTokenizer`` and record its tokens as plain lists.

    Every token produced by the tokenizer is handed to the
    ``process<TypeName>`` method matching its type; each handler appends a
    list-shaped record (or nothing, for EOF) to ``self.outputTokens``.
    """

    def __init__(self, initialState, lastStartTag=None):
        """Remember the starting state name and optional last start tag.

        ``initialState`` is the name of the tokenizer attribute that
        ``parse`` installs as ``tokenizer.state``; ``lastStartTag``, when not
        None, seeds ``tokenizer.currentToken``.
        """
        self.tokenizer = HTMLTokenizer
        self._state = initialState
        self._lastStartTag = lastStartTag

    def parse(self, stream, encoding=None, innerHTML=False):
        """Tokenize ``stream`` and return the list of recorded tokens.

        ``stream`` and ``encoding`` are passed straight to the tokenizer
        constructor.  ``innerHTML`` is accepted but not used here.
        """
        tokenizer = self.tokenizer(stream, encoding)
        self.outputTokens = []

        tokenizer.state = getattr(tokenizer, self._state)
        lastStartTag = self._lastStartTag
        if lastStartTag is not None:
            tokenizer.currentToken = {"type": "startTag",
                                      "name": lastStartTag}

        # Invert constants.tokenTypes so a token's "type" value can be
        # turned back into the name used in the handler method names.
        typeNames = {}
        for typeName, typeValue in constants.tokenTypes.iteritems():
            typeNames[typeValue] = typeName

        for token in tokenizer:
            handler = getattr(self, 'process%s' % typeNames[token["type"]])
            handler(token)

        return self.outputTokens

    def processDoctype(self, token):
        """Record a DOCTYPE as [tag, name, publicId, systemId, correct]."""
        record = [u"DOCTYPE", token["name"], token["publicId"],
                  token["systemId"], token["correct"]]
        self.outputTokens.append(record)

    def processStartTag(self, token):
        """Record a start tag with its attributes and self-closing flag."""
        # The attribute pairs are reversed before building the dict, so for
        # a repeated attribute name the first occurrence is the one kept.
        attributes = dict(token["data"][::-1])
        record = [u"StartTag", token["name"], attributes,
                  token["selfClosing"]]
        self.outputTokens.append(record)

    def processEmptyTag(self, token):
        """Record an empty tag as a StartTag without a self-closing flag.

        A bare ``u"ParseError"`` entry is recorded first when the tag name
        is not one of ``constants.voidElements``.
        """
        isVoid = token["name"] in constants.voidElements
        if not isVoid:
            self.outputTokens.append(u"ParseError")
        attributes = dict(token["data"][::-1])
        self.outputTokens.append([u"StartTag", token["name"], attributes])

    def processEndTag(self, token):
        """Record an end tag with its self-closing flag."""
        record = [u"EndTag", token["name"], token["selfClosing"]]
        self.outputTokens.append(record)

    def processComment(self, token):
        """Record a comment token."""
        record = [u"Comment", token["data"]]
        self.outputTokens.append(record)

    def processSpaceCharacters(self, token):
        """Record whitespace as a Character token.

        After the first call the instance attribute of the same name is
        rebound to ``processCharacters``, which records tokens identically.
        """
        record = [u"Character", token["data"]]
        self.outputTokens.append(record)
        self.processSpaceCharacters = self.processCharacters

    def processCharacters(self, token):
        """Record a Character token."""
        record = [u"Character", token["data"]]
        self.outputTokens.append(record)

    def processEOF(self, token):
        """Ignore the end-of-file token; nothing is recorded."""
        pass

    def processParseError(self, token):
        """Record a parse error together with its data."""
        record = [u"ParseError", token["data"]]
        self.outputTokens.append(record)
|
||||
|
||||
def concatenateCharacterTokens(tokens):
    """Return a new token list with runs of Character tokens merged.

    Consecutive ``["Character", data]`` tokens are collapsed into a single
    token whose data is the concatenation of the run.  All other tokens
    (including the bare string ``"ParseError"`` and list-shaped parse
    errors) are passed through unchanged and break a run.

    The input list and the token lists inside it are left untouched: each
    merged run is accumulated in a fresh list.  This matters because the
    same expected-output list is passed in again when a test is run for
    several initial states; merging in place would append the trailing
    characters to the first token a second time.
    """
    outputTokens = []
    # True while the last entry of outputTokens is a Character token that
    # this function created (and may therefore extend safely).
    inCharacterRun = False
    for token in tokens:
        # Compare against the marker string itself rather than using a
        # membership test, so a Character token whose data happens to be
        # "ParseError" is still recognised as a Character token.
        if token != "ParseError" and token[0] == "Character":
            if inCharacterRun:
                outputTokens[-1][1] += token[1]
            else:
                outputTokens.append(list(token))
                inCharacterRun = True
        else:
            outputTokens.append(token)
            inCharacterRun = False
    return outputTokens
|
||||
|
||||
def normalizeTokens(tokens):
    """Collapse list-shaped parse errors to their bare marker, in place.

    Any entry whose first element equals ``u'ParseError'`` is replaced by
    that first element.  The same list object is returned.
    """
    # TODO: convert tests to reflect arrays
    for position, entry in enumerate(tokens):
        marker = entry[0]
        if marker == u'ParseError':
            tokens[position] = marker
    return tokens
|
||||
|
||||
def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
                ignoreErrors=False):
    """Test whether the test has passed or failed

    If the ignoreErrorOrder flag is set to true we don't test the relative
    positions of parse errors and non parse errors.  If ignoreErrors is
    true, parse errors are left out of the comparison altogether.

    The self-closing flag (the last element of received StartTag/EndTag
    tokens) is only compared when at least one expected token carries it,
    i.e. an expected StartTag of length 4 or EndTag of length 3.  Otherwise
    the last element of each received StartTag/EndTag token is dropped
    before comparing.

    Neither argument is modified; the flag is dropped on copies so that
    calling this function again with the same lists gives the same answer.
    """
    checkSelfClosing = any(
        (token[0] == "StartTag" and len(token) == 4) or
        (token[0] == "EndTag" and len(token) == 3)
        for token in expectedTokens)

    if not checkSelfClosing:
        stripped = []
        for token in receivedTokens:
            if token[0] == "StartTag" or token[0] == "EndTag":
                token = token[:-1]
            stripped.append(token)
        receivedTokens = stripped

    if not ignoreErrorOrder and not ignoreErrors:
        return expectedTokens == receivedTokens

    def splitErrors(tokenList):
        # Sort the tokens into two groups; non-parse errors and parse
        # errors.  Each list is partitioned explicitly instead of pairing
        # it with a label taken from a dict's key order.
        nonErrors = []
        errors = []
        for token in tokenList:
            if token != "ParseError":
                nonErrors.append(token)
            elif not ignoreErrors:
                errors.append(token)
        return [nonErrors, errors]

    return splitErrors(expectedTokens) == splitErrors(receivedTokens)
|
||||
|
||||
def unescape_test(test):
    """Decode the escape sequences of a double-escaped test, in place.

    ``test["input"]`` and the second element of every non-"ParseError"
    output token are decoded with the "unicode-escape" codec.  When a
    token's third element is a dict (an attribute mapping), its keys and
    values are decoded as well.  The same ``test`` dict is returned.
    """
    def decode(inp):
        return inp.decode("unicode-escape")

    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        token[1] = decode(token[1])
        # Only attribute dicts are rewritten.  Other tokens longer than two
        # elements carry non-dict values in this slot (e.g. the DOCTYPE
        # public id, or an EndTag self-closing flag) and are left as is.
        if len(token) > 2 and isinstance(token[2], dict):
            attributes = token[2]
            # Build the decoded mapping first: iterating the dict directly
            # would yield only keys, and deleting entries while iterating
            # is unsafe.
            decoded = dict((decode(key), decode(value))
                           for key, value in attributes.items())
            attributes.clear()
            attributes.update(decoded)
    return test
|
||||
|
||||
|
||||
def runTokenizerTest(test):
    """Run one tokenizer test case and assert that the output matches.

    ``test`` is a dict with at least "input", "output" and "initialState";
    "lastStartTag", "ignoreErrorOrder" and "doubleEscaped" are optional.
    Raises AssertionError, with a message describing the initial state,
    input, expected and received tokens, when the tokens do not match.
    """
    import copy

    # Work on a private copy.  The caller hands over the same dict once per
    # initial state, and the steps below modify it (unescaping, filling in
    # defaults); without the copy a double-escaped test would be unescaped
    # again on every further run.
    test = copy.deepcopy(test)

    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    if 'doubleEscaped' in test:
        test = unescape_test(test)

    expected = concatenateCharacterTokens(test['output'])
    if 'lastStartTag' not in test:
        test['lastStartTag'] = None

    # Swallow anything written to stdout while tokenizing, and always put
    # the real stdout back afterwards, even if the tokenizer raises.
    outBuffer = cStringIO.StringIO()
    stdout = sys.stdout
    sys.stdout = outBuffer
    try:
        parser = TokenizerTestParser(test['initialState'],
                                     test['lastStartTag'])
        tokens = parser.parse(test['input'])
    finally:
        sys.stdout = stdout

    tokens = concatenateCharacterTokens(tokens)
    received = normalizeTokens(tokens)
    errorMsg = u"\n".join(["\n\nInitial state:",
                           test['initialState'],
                           "\nInput:", unicode(test['input']),
                           "\nExpected:", unicode(expected),
                           "\nreceived:", unicode(tokens)])
    errorMsg = errorMsg.encode("utf-8")
    ignoreErrorOrder = test.get('ignoreErrorOrder', False)
    assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
|
||||
|
||||
|
||||
def _doCapitalize(match):
    """Return the captured word character of ``match`` in upper case."""
    letter = match.group(1)
    return letter.upper()


# Bound ``sub`` of a pattern matching a run of non-word characters followed
# by one word character (captured as group 1).
_capitalizeRe = re.compile(r"\W+(\w)").sub


def capitalize(s):
    """Convert a state name such as "Data state" to "dataState".

    The string is lower-cased, then every run of non-word characters
    together with the word character after it is replaced by that
    character in upper case.
    """
    return _capitalizeRe(_doCapitalize, s.lower())
|
||||
|
||||
|
||||
def test_tokenizer():
    """Yield ``(runTokenizerTest, test)`` for every tokenizer test case.

    Each ``*.test`` file returned by ``html5lib_test_files('tokenizer',
    '*.test')`` is loaded as JSON.  Files without a "tests" entry are
    skipped.  A test with no "initialStates" defaults to
    ``["Data state"]``; the test is yielded once per initial state, with
    ``test["initialState"]`` set to the capitalized state name.
    """
    for filename in html5lib_test_files('tokenizer', '*.test'):
        # Close the file as soon as it has been parsed instead of leaving
        # the handle open until it is garbage collected.
        testFile = open(filename)
        try:
            tests = json.load(testFile)
        finally:
            testFile.close()

        if 'tests' not in tests:
            continue

        for test in tests['tests']:
            if 'initialStates' not in test:
                test["initialStates"] = ["Data state"]
            for initialState in test["initialStates"]:
                # The same dict is yielded for each state; only the
                # "initialState" entry differs between yields.
                test["initialState"] = capitalize(initialState)
                yield runTokenizerTest, test
|
||||
|
||||
Reference in New Issue
Block a user