Include html5lib for BeautifulSoup

BeautifulSoup needs lxml or html5, have included html5lib. Also latest BeautifulSoup 4.1.3
2026-07-16 05:54:01 +01:00 · 2012-09-06 10:47:07 +12:00
parent 7aac60cecd
commit b8c5782765
136 changed files with 87265 additions and 2428 deletions
@@ -0,0 +1,75 @@
+{"tests": [
+
+{"description":"PLAINTEXT content model flag",
+"initialStates":["PLAINTEXT state"],
+"lastStartTag":"plaintext",
+"input":"<head>&body;",
+"output":[["Character", "<head>&body;"]]},
+
+{"description":"End tag closing RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp>",
+"output":[["Character", "foo"], ["EndTag", "xmp"]]},
+
+{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xMp>",
+"output":[["Character", "foo"], ["EndTag", "xmp"]]},
+
+{"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp ",
+"output":[["Character", "foo"], "ParseError"]},
+
+{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp",
+"output":[["Character", "foo</xmp"]]},
+
+{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp/",
+"output":[["Character", "foo"], "ParseError"]},
+
+{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp<",
+"output":[["Character", "foo</xmp<"]]},
+
+{"description":"End tag with incorrect name in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"</foo>bar</xmp>",
+"output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
+
+{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"</foo>bar</xmpaar>",
+"output":[["Character", "</foo>bar</xmpaar>"]]},
+
+{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo</xmp></baz>",
+"output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
+
+{"description":"RAWTEXT w/ something looking like an entity",
+"initialStates":["RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"&foo;",
+"output":[["Character", "&foo;"]]},
+
+{"description":"RCDATA w/ an entity",
+"initialStates":["RCDATA state"],
+"lastStartTag":"textarea",
+"input":"&lt;",
+"output":[["Character", "<"]]}
+
+]}
@@ -0,0 +1,90 @@
+{
+    "tests": [
+        {
+            "description":"CR in bogus comment state",
+            "input":"<?\u000d",
+            "output":["ParseError", ["Comment", "?\u000a"]]
+        },
+        {
+            "description":"CRLF in bogus comment state",
+            "input":"<?\u000d\u000a",
+            "output":["ParseError", ["Comment", "?\u000a"]]
+        },
+        {
+            "description":"NUL in RCDATA and RAWTEXT",
+            "doubleEscaped":true,
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "input":"\\u0000",
+            "output":["ParseError", ["Character", "\\uFFFD"]]
+        },
+        {
+            "description":"skip first BOM but not later ones",
+            "input":"\uFEFFfoo\uFEFFbar",
+            "output":[["Character", "foo\uFEFFbar"]]
+        },
+        {
+            "description":"Non BMP-charref in in RCDATA",
+            "initialStates":["RCDATA state"],
+            "input":"&NotEqualTilde;",
+            "output":[["Character", "\u2242\u0338"]]
+        },
+        {
+            "description":"Bad charref in in RCDATA",
+            "initialStates":["RCDATA state"],
+            "input":"&NotEqualTild;",
+            "output":["ParseError", ["Character", "&NotEqualTild;"]]
+        },
+        {
+            "description":"lowercase endtags in RCDATA and RAWTEXT",
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "lastStartTag":"xmp",
+            "input":"</XMP>",
+            "output":[["EndTag","xmp"]]
+        },
+        {
+            "description":"bad endtag in RCDATA and RAWTEXT",
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "lastStartTag":"xmp",
+            "input":"</ XMP>",
+            "output":[["Character","</ XMP>"]]
+        },
+        {
+            "description":"bad endtag in RCDATA and RAWTEXT",
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "lastStartTag":"xmp",
+            "input":"</xm>",
+            "output":[["Character","</xm>"]]
+        },
+        {
+            "description":"bad endtag in RCDATA and RAWTEXT",
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "lastStartTag":"xmp",
+            "input":"</xm ",
+            "output":[["Character","</xm "]]
+        },
+        {
+            "description":"bad endtag in RCDATA and RAWTEXT",
+            "initialStates":["RCDATA state", "RAWTEXT state"],
+            "lastStartTag":"xmp",
+            "input":"</xm/",
+            "output":[["Character","</xm/"]]
+        },
+        {
+            "description":"Non BMP-charref in attribute",
+            "input":"<p id=\"&NotEqualTilde;\">",
+            "output":[["StartTag", "p", {"id":"\u2242\u0338"}]]
+        },
+        {
+            "description":"--!NUL in comment ",
+            "doubleEscaped":true,
+            "input":"<!----!\\u0000-->",
+            "output":["ParseError", ["Comment", "--!\\uFFFD"]]
+        },
+        {
+            "description":"space EOF after doctype ",
+            "input":"<!DOCTYPE html ",
+            "output":["ParseError", ["DOCTYPE", "html", null, null , false]]
+        }
+
+    ]
+}
@@ -0,0 +1,283 @@
+{"tests": [
+
+{"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.",
+"input":"<h a='&noti;'>",
+"output": ["ParseError", ["StartTag", "h", {"a": "&noti;"}]]},
+
+{"description": "Entity name followed by the equals sign in an attribute value.",
+"input":"<h a='&lang='>",
+"output": ["ParseError", ["StartTag", "h", {"a": "&lang="}]]},
+
+{"description": "CR as numeric entity",
+"input":"&#013;",
+"output": ["ParseError", ["Character", "\r"]]},
+
+{"description": "CR as hexadecimal numeric entity",
+"input":"&#x00D;",
+"output": ["ParseError", ["Character", "\r"]]},
+
+{"description": "Windows-1252 EURO SIGN numeric entity.",
+"input":"&#0128;",
+"output": ["ParseError", ["Character", "\u20AC"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
+"input":"&#0129;",
+"output": ["ParseError", ["Character", "\u0081"]]},
+
+{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK numeric entity.",
+"input":"&#0130;",
+"output": ["ParseError", ["Character", "\u201A"]]},
+
+{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK numeric entity.",
+"input":"&#0131;",
+"output": ["ParseError", ["Character", "\u0192"]]},
+
+{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK numeric entity.",
+"input":"&#0132;",
+"output": ["ParseError", ["Character", "\u201E"]]},
+
+{"description": "Windows-1252 HORIZONTAL ELLIPSIS numeric entity.",
+"input":"&#0133;",
+"output": ["ParseError", ["Character", "\u2026"]]},
+
+{"description": "Windows-1252 DAGGER numeric entity.",
+"input":"&#0134;",
+"output": ["ParseError", ["Character", "\u2020"]]},
+
+{"description": "Windows-1252 DOUBLE DAGGER numeric entity.",
+"input":"&#0135;",
+"output": ["ParseError", ["Character", "\u2021"]]},
+
+{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT numeric entity.",
+"input":"&#0136;",
+"output": ["ParseError", ["Character", "\u02C6"]]},
+
+{"description": "Windows-1252 PER MILLE SIGN numeric entity.",
+"input":"&#0137;",
+"output": ["ParseError", ["Character", "\u2030"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON numeric entity.",
+"input":"&#0138;",
+"output": ["ParseError", ["Character", "\u0160"]]},
+
+{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK numeric entity.",
+"input":"&#0139;",
+"output": ["ParseError", ["Character", "\u2039"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE numeric entity.",
+"input":"&#0140;",
+"output": ["ParseError", ["Character", "\u0152"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
+"input":"&#0141;",
+"output": ["ParseError", ["Character", "\u008D"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON numeric entity.",
+"input":"&#0142;",
+"output": ["ParseError", ["Character", "\u017D"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
+"input":"&#0143;",
+"output": ["ParseError", ["Character", "\u008F"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
+"input":"&#0144;",
+"output": ["ParseError", ["Character", "\u0090"]]},
+
+{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK numeric entity.",
+"input":"&#0145;",
+"output": ["ParseError", ["Character", "\u2018"]]},
+
+{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK numeric entity.",
+"input":"&#0146;",
+"output": ["ParseError", ["Character", "\u2019"]]},
+
+{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK numeric entity.",
+"input":"&#0147;",
+"output": ["ParseError", ["Character", "\u201C"]]},
+
+{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK numeric entity.",
+"input":"&#0148;",
+"output": ["ParseError", ["Character", "\u201D"]]},
+
+{"description": "Windows-1252 BULLET numeric entity.",
+"input":"&#0149;",
+"output": ["ParseError", ["Character", "\u2022"]]},
+
+{"description": "Windows-1252 EN DASH numeric entity.",
+"input":"&#0150;",
+"output": ["ParseError", ["Character", "\u2013"]]},
+
+{"description": "Windows-1252 EM DASH numeric entity.",
+"input":"&#0151;",
+"output": ["ParseError", ["Character", "\u2014"]]},
+
+{"description": "Windows-1252 SMALL TILDE numeric entity.",
+"input":"&#0152;",
+"output": ["ParseError", ["Character", "\u02DC"]]},
+
+{"description": "Windows-1252 TRADE MARK SIGN numeric entity.",
+"input":"&#0153;",
+"output": ["ParseError", ["Character", "\u2122"]]},
+
+{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON numeric entity.",
+"input":"&#0154;",
+"output": ["ParseError", ["Character", "\u0161"]]},
+
+{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK numeric entity.",
+"input":"&#0155;",
+"output": ["ParseError", ["Character", "\u203A"]]},
+
+{"description": "Windows-1252 LATIN SMALL LIGATURE OE numeric entity.",
+"input":"&#0156;",
+"output": ["ParseError", ["Character", "\u0153"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
+"input":"&#0157;",
+"output": ["ParseError", ["Character", "\u009D"]]},
+
+{"description": "Windows-1252 EURO SIGN hexadecimal numeric entity.",
+"input":"&#x080;",
+"output": ["ParseError", ["Character", "\u20AC"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
+"input":"&#x081;",
+"output": ["ParseError", ["Character", "\u0081"]]},
+
+{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x082;",
+"output": ["ParseError", ["Character", "\u201A"]]},
+
+{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK hexadecimal numeric entity.",
+"input":"&#x083;",
+"output": ["ParseError", ["Character", "\u0192"]]},
+
+{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x084;",
+"output": ["ParseError", ["Character", "\u201E"]]},
+
+{"description": "Windows-1252 HORIZONTAL ELLIPSIS hexadecimal numeric entity.",
+"input":"&#x085;",
+"output": ["ParseError", ["Character", "\u2026"]]},
+
+{"description": "Windows-1252 DAGGER hexadecimal numeric entity.",
+"input":"&#x086;",
+"output": ["ParseError", ["Character", "\u2020"]]},
+
+{"description": "Windows-1252 DOUBLE DAGGER hexadecimal numeric entity.",
+"input":"&#x087;",
+"output": ["ParseError", ["Character", "\u2021"]]},
+
+{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT hexadecimal numeric entity.",
+"input":"&#x088;",
+"output": ["ParseError", ["Character", "\u02C6"]]},
+
+{"description": "Windows-1252 PER MILLE SIGN hexadecimal numeric entity.",
+"input":"&#x089;",
+"output": ["ParseError", ["Character", "\u2030"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON hexadecimal numeric entity.",
+"input":"&#x08A;",
+"output": ["ParseError", ["Character", "\u0160"]]},
+
+{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x08B;",
+"output": ["ParseError", ["Character", "\u2039"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE hexadecimal numeric entity.",
+"input":"&#x08C;",
+"output": ["ParseError", ["Character", "\u0152"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
+"input":"&#x08D;",
+"output": ["ParseError", ["Character", "\u008D"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON hexadecimal numeric entity.",
+"input":"&#x08E;",
+"output": ["ParseError", ["Character", "\u017D"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
+"input":"&#x08F;",
+"output": ["ParseError", ["Character", "\u008F"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
+"input":"&#x090;",
+"output": ["ParseError", ["Character", "\u0090"]]},
+
+{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x091;",
+"output": ["ParseError", ["Character", "\u2018"]]},
+
+{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x092;",
+"output": ["ParseError", ["Character", "\u2019"]]},
+
+{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x093;",
+"output": ["ParseError", ["Character", "\u201C"]]},
+
+{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x094;",
+"output": ["ParseError", ["Character", "\u201D"]]},
+
+{"description": "Windows-1252 BULLET hexadecimal numeric entity.",
+"input":"&#x095;",
+"output": ["ParseError", ["Character", "\u2022"]]},
+
+{"description": "Windows-1252 EN DASH hexadecimal numeric entity.",
+"input":"&#x096;",
+"output": ["ParseError", ["Character", "\u2013"]]},
+
+{"description": "Windows-1252 EM DASH hexadecimal numeric entity.",
+"input":"&#x097;",
+"output": ["ParseError", ["Character", "\u2014"]]},
+
+{"description": "Windows-1252 SMALL TILDE hexadecimal numeric entity.",
+"input":"&#x098;",
+"output": ["ParseError", ["Character", "\u02DC"]]},
+
+{"description": "Windows-1252 TRADE MARK SIGN hexadecimal numeric entity.",
+"input":"&#x099;",
+"output": ["ParseError", ["Character", "\u2122"]]},
+
+{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON hexadecimal numeric entity.",
+"input":"&#x09A;",
+"output": ["ParseError", ["Character", "\u0161"]]},
+
+{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
+"input":"&#x09B;",
+"output": ["ParseError", ["Character", "\u203A"]]},
+
+{"description": "Windows-1252 LATIN SMALL LIGATURE OE hexadecimal numeric entity.",
+"input":"&#x09C;",
+"output": ["ParseError", ["Character", "\u0153"]]},
+
+{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
+"input":"&#x09D;",
+"output": ["ParseError", ["Character", "\u009D"]]},
+
+{"description": "Windows-1252 LATIN SMALL LETTER Z WITH CARON hexadecimal numeric entity.",
+"input":"&#x09E;",
+"output": ["ParseError", ["Character", "\u017E"]]},
+
+{"description": "Windows-1252 LATIN CAPITAL LETTER Y WITH DIAERESIS hexadecimal numeric entity.",
+"input":"&#x09F;",
+"output": ["ParseError", ["Character", "\u0178"]]},
+
+{"description": "Decimal numeric entity followed by hex character a.",
+"input":"&#97a",
+"output": ["ParseError", ["Character", "aa"]]},
+
+{"description": "Decimal numeric entity followed by hex character A.",
+"input":"&#97A",
+"output": ["ParseError", ["Character", "aA"]]},
+
+{"description": "Decimal numeric entity followed by hex character f.",
+"input":"&#97f",
+"output": ["ParseError", ["Character", "af"]]},
+
+{"description": "Decimal numeric entity followed by hex character A.",
+"input":"&#97F",
+"output": ["ParseError", ["Character", "aF"]]}
+
+]}
@@ -0,0 +1,33 @@
+{"tests": [
+
+{"description":"Commented close tag in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo<!--</xmp>--></xmp>",
+"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
+
+{"description":"Bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo<!-->baz</xmp>",
+"output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
+
+{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo<!--></xmp><!-->baz</xmp>",
+"output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
+
+{"description":"Commented entities in RCDATA",
+"initialStates":["RCDATA state"],
+"lastStartTag":"xmp",
+"input":" &amp; <!-- &amp; --> &amp; </xmp>",
+"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
+
+{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
+"lastStartTag":"xmp",
+"input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
+"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
+
+]}
@@ -0,0 +1,7 @@
+{"tests": [
+
+{"description":"<!---- >",
+"input":"<!---- >",
+"output":["ParseError", "ParseError", ["Comment","-- >"]]}
+
+]}
@@ -0,0 +1,196 @@
+{"tests": [
+
+{"description":"Correct Doctype lowercase",
+"input":"<!DOCTYPE html>",
+"output":[["DOCTYPE", "html", null, null, true]]},
+
+{"description":"Correct Doctype uppercase",
+"input":"<!DOCTYPE HTML>",
+"output":[["DOCTYPE", "html", null, null, true]]},
+
+{"description":"Correct Doctype mixed case",
+"input":"<!DOCTYPE HtMl>", 
+"output":[["DOCTYPE", "html", null, null, true]]},
+
+{"description":"Correct Doctype case with EOF",
+"input":"<!DOCTYPE HtMl", 
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Truncated doctype start",
+"input":"<!DOC>", 
+"output":["ParseError", ["Comment", "DOC"]]},
+
+{"description":"Doctype in error",
+"input":"<!DOCTYPE foo>", 
+"output":[["DOCTYPE", "foo", null, null, true]]},
+
+{"description":"Single Start Tag",
+"input":"<h>",
+"output":[["StartTag", "h", {}]]},
+
+{"description":"Empty end tag",
+"input":"</>",
+"output":["ParseError"]},
+
+{"description":"Empty start tag",
+"input":"<>",
+"output":["ParseError", ["Character", "<>"]]},
+
+{"description":"Start Tag w/attribute",
+"input":"<h a='b'>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start Tag w/attribute no quotes",
+"input":"<h a=b>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start/End Tag",
+"input":"<h></h>",
+"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
+
+{"description":"Two unclosed start tags",
+"input":"<p>One<p>Two",
+"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
+
+{"description":"End Tag w/attribute",
+"input":"<h></h a='b'>",
+"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
+
+{"description":"Multiple atts",
+"input":"<h a='b' c='d'>",
+"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Multiple atts no space",
+"input":"<h a='b'c='d'>",
+"output":["ParseError", ["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Repeated attr",
+ "input":"<h a='b' a='d'>",
+ "output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Simple comment",
+ "input":"<!--comment-->",
+ "output":[["Comment", "comment"]]},
+
+{"description":"Comment, Central dash no space",
+ "input":"<!----->",
+ "output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Comment, two central dashes",
+"input":"<!-- --comment -->",
+"output":["ParseError", ["Comment", " --comment "]]},
+
+{"description":"Unfinished comment",
+"input":"<!--comment",
+"output":["ParseError", ["Comment", "comment"]]},
+
+{"description":"Start of a comment",
+"input":"<!-",
+"output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Short comment",
+ "input":"<!-->",
+ "output":["ParseError", ["Comment", ""]]},
+
+{"description":"Short comment two",
+ "input":"<!--->",
+ "output":["ParseError", ["Comment", ""]]},
+
+{"description":"Short comment three",
+ "input":"<!---->",
+ "output":[["Comment", ""]]},
+
+
+{"description":"Ampersand EOF",
+"input":"&",
+"output":[["Character", "&"]]},
+
+{"description":"Ampersand ampersand EOF",
+"input":"&&",
+"output":[["Character", "&&"]]},
+
+{"description":"Ampersand space EOF",
+"input":"& ",
+"output":[["Character", "& "]]},
+
+{"description":"Unfinished entity",
+"input":"&f",
+"output":["ParseError", ["Character", "&f"]]},
+
+{"description":"Ampersand, number sign",
+"input":"&#",
+"output":["ParseError", ["Character", "&#"]]},
+
+{"description":"Unfinished numeric entity",
+"input":"&#x",
+"output":["ParseError", ["Character", "&#x"]]},
+
+{"description":"Entity with trailing semicolon (1)",
+"input":"I'm &not;it",
+"output":[["Character","I'm \u00ACit"]]},
+
+{"description":"Entity with trailing semicolon (2)",
+"input":"I'm &notin;",
+"output":[["Character","I'm \u2209"]]},
+
+{"description":"Entity without trailing semicolon (1)",
+"input":"I'm &notit",
+"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACit"]]},
+
+{"description":"Entity without trailing semicolon (2)",
+"input":"I'm &notin",
+"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACin"]]},
+
+{"description":"Partial entity match at end of file",
+"input":"I'm &no",
+"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
+
+{"description":"Non-ASCII character reference name",
+"input":"&\u00AC;",
+"output":["ParseError", ["Character", "&\u00AC;"]]},
+
+{"description":"ASCII decimal entity",
+"input":"&#0036;",
+"output":[["Character","$"]]},
+
+{"description":"ASCII hexadecimal entity",
+"input":"&#x3f;",
+"output":[["Character","?"]]},
+
+{"description":"Hexadecimal entity in attribute",
+"input":"<h a='&#x3f;'></h>",
+"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
+
+{"description":"Entity in attribute without semicolon ending in x",
+"input":"<h a='&notx'>",
+"output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
+
+{"description":"Entity in attribute without semicolon ending in 1",
+"input":"<h a='&not1'>",
+"output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
+
+{"description":"Entity in attribute without semicolon ending in i",
+"input":"<h a='&noti'>",
+"output":["ParseError", ["StartTag", "h", {"a":"&noti"}]]},
+
+{"description":"Entity in attribute without semicolon",
+"input":"<h a='&COPY'>",
+"output":["ParseError", ["StartTag", "h", {"a":"\u00A9"}]]},
+
+{"description":"Unquoted attribute ending in ampersand",
+"input":"<s o=& t>",
+"output":[["StartTag","s",{"o":"&","t":""}]]},
+
+{"description":"Unquoted attribute at end of tag with final character of &, with tag followed by characters",
+"input":"<a a=a&>foo",
+"output":[["StartTag", "a", {"a":"a&"}], ["Character", "foo"]]},
+
+{"description":"plaintext element",
+ "input":"<plaintext>foobar",
+ "output":[["StartTag","plaintext",{}], ["Character","foobar"]]},
+
+{"description":"Open angled bracket in unquoted attribute value state",
+ "input":"<a a=f<>",
+ "output":["ParseError", ["StartTag", "a", {"a":"f<"}]]}
+
+]}
@@ -0,0 +1,179 @@
+{"tests": [
+
+{"description":"DOCTYPE without name",
+"input":"<!DOCTYPE>",
+"output":["ParseError", "ParseError", ["DOCTYPE", "", null, null, false]]},
+
+{"description":"DOCTYPE without space before name",
+"input":"<!DOCTYPEhtml>",
+"output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
+
+{"description":"Incorrect DOCTYPE without a space before name",
+"input":"<!DOCTYPEfoo>",
+"output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
+
+{"description":"DOCTYPE with publicId",
+"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
+"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
+
+{"description":"DOCTYPE with EOF after PUBLIC",
+"input":"<!DOCTYPE html PUBLIC",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"DOCTYPE with EOF after PUBLIC '",
+"input":"<!DOCTYPE html PUBLIC '",
+"output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
+
+{"description":"DOCTYPE with EOF after PUBLIC 'x",
+"input":"<!DOCTYPE html PUBLIC 'x",
+"output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
+
+{"description":"DOCTYPE with systemId",
+"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
+"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
+
+{"description":"DOCTYPE with publicId and systemId",
+"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
+"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
+
+{"description":"DOCTYPE with > in double-quoted publicId",
+"input":"<!DOCTYPE html PUBLIC \">x",
+"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
+
+{"description":"DOCTYPE with > in single-quoted publicId",
+"input":"<!DOCTYPE html PUBLIC '>x",
+"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
+
+{"description":"DOCTYPE with > in double-quoted systemId",
+"input":"<!DOCTYPE html PUBLIC \"foo\" \">x",
+"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
+
+{"description":"DOCTYPE with > in single-quoted systemId",
+"input":"<!DOCTYPE html PUBLIC 'foo' '>x",
+"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
+
+{"description":"Incomplete doctype",
+"input":"<!DOCTYPE html ",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Numeric entity representing the NUL character",
+"input":"&#0000;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing the NUL character",
+"input":"&#x0000;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#2225222;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#x1010FFFF;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity pair representing a surrogate pair",
+"input":"&#xD869;&#xDED6;",
+"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity with mixed uppercase and lowercase",
+"input":"&#xaBcD;",
+"output":[["Character", "\uABCD"]]},
+
+{"description":"Entity without a name",
+"input":"&;",
+"output":["ParseError", ["Character", "&;"]]},
+
+{"description":"Unescaped ampersand in attribute value",
+"input":"<h a='&'>",
+"output":[["StartTag", "h", { "a":"&" }]]},
+
+{"description":"StartTag containing <",
+"input":"<a<b>",
+"output":[["StartTag", "a<b", { }]]},
+
+{"description":"Non-void element containing trailing /",
+"input":"<h/>",
+"output":[["StartTag","h",{},true]]},
+
+{"description":"Void element with permitted slash",
+"input":"<br/>",
+"output":[["StartTag","br",{},true]]},
+
+{"description":"Void element with permitted slash (with attribute)",
+"input":"<br foo='bar'/>",
+"output":[["StartTag","br",{"foo":"bar"},true]]},
+
+{"description":"StartTag containing /",
+"input":"<h/a='b'>",
+"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Double-quoted attribute value",
+"input":"<h a=\"b\">",
+"output":[["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Unescaped </",
+"input":"</",
+"output":["ParseError", ["Character", "</"]]},
+
+{"description":"Illegal end tag name",
+"input":"</1>",
+"output":["ParseError", ["Comment", "1"]]},
+
+{"description":"Simili processing instruction",
+"input":"<?namespace>",
+"output":["ParseError", ["Comment", "?namespace"]]},
+
+{"description":"A bogus comment stops at >, even if preceeded by two dashes",
+"input":"<?foo-->",
+"output":["ParseError", ["Comment", "?foo--"]]},
+
+{"description":"Unescaped <",
+"input":"foo < bar",
+"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
+
+{"description":"Null Byte Replacement",
+"input":"\u0000",
+"output":["ParseError", ["Character", "\u0000"]]},
+
+{"description":"Comment with dash",
+"input":"<!---x",
+"output":["ParseError", ["Comment", "-x"]]},
+
+{"description":"Entity + newline",
+"input":"\nx\n&gt;\n",
+"output":[["Character","\nx\n>\n"]]},
+
+{"description":"Start tag with no attributes but space before the greater-than sign",
+"input":"<h >",
+"output":[["StartTag", "h", {}]]},
+
+{"description":"Empty attribute followed by uppercase attribute",
+"input":"<h a B=''>",
+"output":[["StartTag", "h", {"a":"", "b":""}]]},
+
+{"description":"Double-quote after attribute name",
+"input":"<h a \">",
+"output":["ParseError", ["StartTag", "h", {"a":"", "\"":""}]]},
+
+{"description":"Single-quote after attribute name",
+"input":"<h a '>",
+"output":["ParseError", ["StartTag", "h", {"a":"", "'":""}]]},
+
+{"description":"Empty end tag with following characters",
+"input":"a</>bc",
+"output":[["Character", "a"], "ParseError", ["Character", "bc"]]},
+
+{"description":"Empty end tag with following tag",
+"input":"a</><b>c",
+"output":[["Character", "a"], "ParseError", ["StartTag", "b", {}], ["Character", "c"]]},
+
+{"description":"Empty end tag with following comment",
+"input":"a</><!--b-->c",
+"output":[["Character", "a"], "ParseError", ["Comment", "b"], ["Character", "c"]]},
+
+{"description":"Empty end tag with following end tag",
+"input":"a</></b>c",
+"output":[["Character", "a"], "ParseError", ["EndTag", "b"], ["Character", "c"]]}
+
+]}
@@ -0,0 +1,344 @@
+{"tests": [
+
+{"description":"< in attribute name",
+"input":"<z/0  <>",
+"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
+
+{"description":"< in attribute value",
+"input":"<z x=<>",
+"output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
+
+{"description":"= in unquoted attribute value",
+"input":"<z z=z=z>",
+"output":["ParseError", ["StartTag", "z", {"z": "z=z"}]]},
+
+{"description":"= attribute",
+"input":"<z =>",
+"output":["ParseError", ["StartTag", "z", {"=": ""}]]},
+
+{"description":"== attribute",
+"input":"<z ==>",
+"output":["ParseError", "ParseError", ["StartTag", "z", {"=": ""}]]},
+
+{"description":"=== attribute",
+"input":"<z ===>",
+"output":["ParseError", "ParseError", ["StartTag", "z", {"=": "="}]]},
+
+{"description":"==== attribute",
+"input":"<z ====>",
+"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"=": "=="}]]},
+
+{"description":"Allowed \" after ampersand in attribute value",
+"input":"<z z=\"&\">",
+"output":[["StartTag", "z", {"z": "&"}]]},
+
+{"description":"Non-allowed ' after ampersand in attribute value",
+"input":"<z z=\"&'\">",
+"output":["ParseError", ["StartTag", "z", {"z": "&'"}]]},
+
+{"description":"Allowed ' after ampersand in attribute value",
+"input":"<z z='&'>",
+"output":[["StartTag", "z", {"z": "&"}]]},
+
+{"description":"Non-allowed \" after ampersand in attribute value",
+"input":"<z z='&\"'>",
+"output":["ParseError", ["StartTag", "z", {"z": "&\""}]]},
+
+{"description":"Text after bogus character reference",
+"input":"<z z='&xlink_xmlns;'>bar<z>",
+"output":["ParseError",["StartTag","z",{"z":"&xlink_xmlns;"}],["Character","bar"],["StartTag","z",{}]]},
+
+{"description":"Text after hex character reference",
+"input":"<z z='&#x0020; foo'>bar<z>",
+"output":[["StartTag","z",{"z":"  foo"}],["Character","bar"],["StartTag","z",{}]]},
+
+{"description":"Attribute name starting with \"",
+"input":"<foo \"='bar'>",
+"output":["ParseError", ["StartTag", "foo", {"\"": "bar"}]]},
+
+{"description":"Attribute name starting with '",
+"input":"<foo '='bar'>",
+"output":["ParseError", ["StartTag", "foo", {"'": "bar"}]]},
+
+{"description":"Attribute name containing \"",
+"input":"<foo a\"b='bar'>",
+"output":["ParseError", ["StartTag", "foo", {"a\"b": "bar"}]]},
+
+{"description":"Attribute name containing '",
+"input":"<foo a'b='bar'>",
+"output":["ParseError", ["StartTag", "foo", {"a'b": "bar"}]]},
+
+{"description":"Unquoted attribute value containing '",
+"input":"<foo a=b'c>",
+"output":["ParseError", ["StartTag", "foo", {"a": "b'c"}]]},
+
+{"description":"Unquoted attribute value containing \"",
+"input":"<foo a=b\"c>",
+"output":["ParseError", ["StartTag", "foo", {"a": "b\"c"}]]},
+
+{"description":"Double-quoted attribute value not followed by whitespace",
+"input":"<foo a=\"b\"c>",
+"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
+
+{"description":"Single-quoted attribute value not followed by whitespace",
+"input":"<foo a='b'c>",
+"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
+
+{"description":"Quoted attribute followed by permitted /",
+"input":"<br a='b'/>",
+"output":[["StartTag","br",{"a":"b"},true]]},
+
+{"description":"Quoted attribute followed by non-permitted /",
+"input":"<bar a='b'/>",
+"output":[["StartTag","bar",{"a":"b"},true]]},
+
+{"description":"CR EOF after doctype name",
+"input":"<!doctype html \r",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"CR EOF in tag name",
+"input":"<z\r",
+"output":["ParseError"]},
+
+{"description":"Slash EOF in tag name",
+"input":"<z/",
+"output":["ParseError"]},
+
+{"description":"Zero hex numeric entity",
+"input":"&#x0",
+"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Zero decimal numeric entity",
+"input":"&#0",
+"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Zero-prefixed hex numeric entity",
+"input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
+"output":[["Character", "A"]]},
+
+{"description":"Zero-prefixed decimal numeric entity",
+"input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
+"output":[["Character", "A"]]},
+
+{"description":"Empty hex numeric entities",
+"input":"&#x &#X ",
+"output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
+
+{"description":"Empty decimal numeric entities",
+"input":"&# &#; ",
+"output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
+
+{"description":"Non-BMP numeric entity",
+"input":"&#x10000;",
+"output":[["Character", "\uD800\uDC00"]]},
+
+{"description":"Maximum non-BMP numeric entity",
+"input":"&#X10FFFF;",
+"output":["ParseError", ["Character", "\uDBFF\uDFFF"]]},
+
+{"description":"Above maximum numeric entity",
+"input":"&#x110000;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"32-bit hex numeric entity",
+"input":"&#x80000041;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"33-bit hex numeric entity",
+"input":"&#x100000041;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"33-bit decimal numeric entity",
+"input":"&#4294967361;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"65-bit hex numeric entity",
+"input":"&#x10000000000000041;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"65-bit decimal numeric entity",
+"input":"&#18446744073709551681;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Surrogate code point edge cases",
+"input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
+"output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
+
+{"description":"Uppercase start tag name",
+"input":"<X>",
+"output":[["StartTag", "x", {}]]},
+
+{"description":"Uppercase end tag name",
+"input":"</X>",
+"output":[["EndTag", "x"]]},
+
+{"description":"Uppercase attribute name",
+"input":"<x X>",
+"output":[["StartTag", "x", { "x":"" }]]},
+
+{"description":"Tag/attribute name case edge values",
+"input":"<x@AZ[`az{ @AZ[`az{>",
+"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
+
+{"description":"Duplicate different-case attributes",
+"input":"<x x=1 x=2 X=3>",
+"output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
+
+{"description":"Uppercase close tag attributes",
+"input":"</x X>",
+"output":["ParseError", ["EndTag", "x"]]},
+
+{"description":"Duplicate close tag attributes",
+"input":"</x x x>",
+"output":["ParseError", "ParseError", ["EndTag", "x"]]},
+
+{"description":"Permitted slash",
+"input":"<br/>",
+"output":[["StartTag","br",{},true]]},
+
+{"description":"Non-permitted slash",
+"input":"<xr/>",
+"output":[["StartTag","xr",{},true]]},
+
+{"description":"Permitted slash but in close tag",
+"input":"</br/>",
+"output":["ParseError", ["EndTag", "br"]]},
+
+{"description":"Doctype public case-sensitivity (1)",
+"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
+"output":[["DOCTYPE", "html", "AbC", "XyZ", true]]},
+
+{"description":"Doctype public case-sensitivity (2)",
+"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
+"output":[["DOCTYPE", "html", "aBc", "xYz", true]]},
+
+{"description":"Doctype system case-sensitivity (1)",
+"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
+"output":[["DOCTYPE", "html", null, "XyZ", true]]},
+
+{"description":"Doctype system case-sensitivity (2)",
+"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
+"output":[["DOCTYPE", "html", null, "xYz", true]]},
+
+{"description":"U+0000 in lookahead region after non-matching character",
+"input":"<!doc>\u0000",
+"output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\u0000"]],
+"ignoreErrorOrder":true},
+
+{"description":"U+0000 in lookahead region",
+"input":"<!doc\u0000",
+"output":["ParseError", ["Comment", "doc\uFFFD"]],
+"ignoreErrorOrder":true},
+
+{"description":"U+0080 in lookahead region",
+"input":"<!doc\u0080",
+"output":["ParseError", "ParseError", ["Comment", "doc\u0080"]],
+"ignoreErrorOrder":true},
+
+{"description":"U+FDD1 in lookahead region",
+"input":"<!doc\uFDD1",
+"output":["ParseError", "ParseError", ["Comment", "doc\uFDD1"]],
+"ignoreErrorOrder":true},
+
+{"description":"U+1FFFF in lookahead region",
+"input":"<!doc\uD83F\uDFFF",
+"output":["ParseError", "ParseError", ["Comment", "doc\uD83F\uDFFF"]],
+"ignoreErrorOrder":true},
+
+{"description":"CR followed by non-LF",
+"input":"\r?",
+"output":[["Character", "\n?"]]},
+
+{"description":"CR at EOF",
+"input":"\r",
+"output":[["Character", "\n"]]},
+
+{"description":"LF at EOF",
+"input":"\n",
+"output":[["Character", "\n"]]},
+
+{"description":"CR LF",
+"input":"\r\n",
+"output":[["Character", "\n"]]},
+
+{"description":"CR CR",
+"input":"\r\r",
+"output":[["Character", "\n\n"]]},
+
+{"description":"LF LF",
+"input":"\n\n",
+"output":[["Character", "\n\n"]]},
+
+{"description":"LF CR",
+"input":"\n\r",
+"output":[["Character", "\n\n"]]},
+
+{"description":"text CR CR CR text",
+"input":"text\r\r\rtext",
+"output":[["Character", "text\n\n\ntext"]]},
+
+{"description":"Doctype publik",
+"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Doctype publi",
+"input":"<!DOCTYPE html PUBLI",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Doctype sistem",
+"input":"<!DOCTYPE html SISTEM \"AbC\">",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Doctype sys",
+"input":"<!DOCTYPE html SYS",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
+
+{"description":"Doctype html x>text",
+"input":"<!DOCTYPE html x>text",
+"output":["ParseError", ["DOCTYPE", "html", null, null, false], ["Character", "text"]]},
+
+{"description":"Grave accent in unquoted attribute",
+"input":"<a a=aa`>",
+"output":["ParseError", ["StartTag", "a", {"a":"aa`"}]]},
+
+{"description":"EOF in tag name state ",
+"input":"<a",
+"output":["ParseError"]},
+
+{"description":"EOF in tag name state",
+"input":"<a",
+"output":["ParseError"]},
+
+{"description":"EOF in before attribute name state",
+"input":"<a ",
+"output":["ParseError"]},
+
+{"description":"EOF in attribute name state",
+"input":"<a a",
+"output":["ParseError"]},
+
+{"description":"EOF in after attribute name state",
+"input":"<a a ",
+"output":["ParseError"]},
+
+{"description":"EOF in before attribute value state",
+"input":"<a a =",
+"output":["ParseError"]},
+
+{"description":"EOF in attribute value (double quoted) state",
+"input":"<a a =\"a",
+"output":["ParseError"]},
+
+{"description":"EOF in attribute value (single quoted) state",
+"input":"<a a ='a",
+"output":["ParseError"]},
+
+{"description":"EOF in attribute value (unquoted) state",
+"input":"<a a =a",
+"output":["ParseError"]},
+
+{"description":"EOF in after attribute value state",
+"input":"<a a ='a'",
+"output":["ParseError"]}
+
+]}
@@ -0,0 +1,27 @@
+{"tests" : [
+{"description": "Invalid Unicode character U+DFFF",
+"doubleEscaped":true,
+"input": "\\uDFFF",
+"output":["ParseError", ["Character", "\\uFFFD"]]},
+
+{"description": "Invalid Unicode character U+D800",
+"doubleEscaped":true,
+"input": "\\uD800",
+"output":["ParseError", ["Character", "\\uFFFD"]]},
+
+{"description": "Invalid Unicode character U+DFFF with valid preceding character",
+"doubleEscaped":true,
+"input": "a\\uDFFF",
+"output":["ParseError", ["Character", "a\\uFFFD"]]},
+
+{"description": "Invalid Unicode character U+D800 with valid following character",
+"doubleEscaped":true,
+"input": "\\uD800a",
+"output":["ParseError", ["Character", "\\uFFFDa"]]},
+
+{"description":"CR followed by U+0000",
+"input":"\r\u0000",
+"output":[["Character", "\n"], "ParseError", ["Character", "\u0000"]],
+"ignoreErrorOrder":true}
+]
+}
@@ -0,0 +1,22 @@
+{"xmlViolationTests": [
+
+{"description":"Non-XML character",
+"input":"a\uFFFFb",
+"ignoreErrorOrder":true,
+"output":["ParseError",["Character","a\uFFFDb"]]},
+
+{"description":"Non-XML space",
+"input":"a\u000Cb",
+"ignoreErrorOrder":true,
+"output":[["Character","a b"]]},
+
+{"description":"Double hyphen in comment",
+"input":"<!-- foo -- bar -->",
+"output":["ParseError",["Comment"," foo - - bar "]]},
+
+{"description":"FF between attributes",
+"input":"<a b=''\u000Cc=''>",
+"output":[["StartTag","a",{"b":"","c":""}]]}
+]}
+
+