1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Test tokenizer against html5lib test suite

This commit is contained in:
Nick Wellnhofer 2024-09-08 20:45:48 +02:00
parent 27752f75ca
commit c6af101728
30 changed files with 33254 additions and 0 deletions

View File

@ -0,0 +1,27 @@
The test cases in this directory are derived from the html5lib test
suite available from https://github.com/html5lib/html5lib-tests under
the following license:
---
Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and
other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,96 @@
0
Character
<head>&body;
1
Character
</plaintext>&body;
2
Character
foo
EndTag
xmp
3
Character
foo
EndTag
xmp
4
Character
foo
EndTag
xmp
5
Character
foo
EndTag
xmp
6
Character
foo
7
Character
foo
8
Character
foo</xmp
9
Character
foo</xmp
10
Character
foo
11
Character
foo
12
Character
foo</xmp<
13
Character
foo</xmp<
14
Character
</foo>bar
EndTag
xmp
15
Character
</foo>bar
EndTag
xmp
16
Character
</xmp</xmp
EndTag
xmp
17
Character
</xmp</xmp
EndTag
xmp
18
Character
</foo>bar</xmpaar>
19
Character
</foo>bar</xmpaar>
20
Character
foo
EndTag
xmp
EndTag
baz
21
Character
foo
EndTag
xmp
EndTag
baz
22
Character
&foo;
23
Character
<

View File

@ -0,0 +1,190 @@
0
Comment
?
1
Comment
?
2
Comment
?
3
Character
<EFBFBD>
4
Character
<EFBFBD>
5
Character
<EFBFBD>
6
Character
<EFBFBD>
7
Character
<!--test<73>--><!--test-<2D>--><!--test--<2D>-->
8
Character
<!--<script><3E>--><!--<script>-<2D>--><!--<script>--<2D>-->
9
Character
<!--test
10
Character
<!--test-
11
Character
<!--test--
12
Character
<!--<script>-
13
Character
<!--<script>--
14
Character
<!--<script>
15
Character
<!-- - -->
16
Character
<!-- -< -->
17
Character
<!--test--->
18
Character
<!--
EndTag
script
Character
-->
EndTag
script
19
Character
<!-- <script></script> -->
EndTag
script
20
Character
<!-- <script><script></script>
EndTag
script
Character
-->
EndTag
script
21
Character
<!-- <script>-->
EndTag
script
Character
-->
EndTag
script
22
Character
<!--<scrip>
EndTag
script
Character
-->
23
Character
<!--<script
EndTag
script
Character
-->
24
Character
<!--<script></scrip>-->
25
Character
<!--<script></script-->
26
Character
foobar
27
Character
foobar
28
Character
foobar
29
Character
foobar
30
Character
≂̸
31
Character
&NotEqualTild;
32
EndTag
xmp
33
EndTag
xmp
34
EndTag
xmp
35
Character
</ XMP>
36
Character
</ XMP>
37
Character
</ XMP>
38
Character
</xm>
39
Character
</xm>
40
Character
</xm>
41
Character
</xm
42
Character
</xm
43
Character
</xm
44
Character
</xm/
45
Character
</xm/
46
Character
</xm/
47
StartTag
p id=≂̸
48
Comment
--!<21>
49
DOCTYPE
html
<none>
<none>
50
Comment
[CDATA[foo]]
51
Character
<b>hello world</b>

View File

@ -0,0 +1,240 @@
0
StartTag
h a=&noti;
1
StartTag
h a=&lang=
2
StartTag
h a=&not=
3
StartTag
h a=&noti;
4
StartTag
h a=&lang=
5
StartTag
h a=&not=
6
StartTag
h a=&noti;
7
StartTag
h a=&lang=
8
StartTag
h a=&not=
9
Character
&rrrraannddom;
10
Character
¬i;
11
Character
&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;
12
Character
13
Character
14
Character
15
Character

16
Character
17
Character
ƒ
18
Character
19
Character
20
Character
21
Character
22
Character
ˆ
23
Character
24
Character
Š
25
Character
26
Character
Œ
27
Character

28
Character
Ž
29
Character

30
Character

31
Character
32
Character
33
Character
34
Character
35
Character
36
Character
37
Character
38
Character
˜
39
Character
40
Character
š
41
Character
42
Character
œ
43
Character

44
Character
45
Character

46
Character
47
Character
ƒ
48
Character
49
Character
50
Character
51
Character
52
Character
ˆ
53
Character
54
Character
Š
55
Character
56
Character
Œ
57
Character

58
Character
Ž
59
Character

60
Character

61
Character
62
Character
63
Character
64
Character
65
Character
66
Character
67
Character
68
Character
˜
69
Character
70
Character
š
71
Character
72
Character
œ
73
Character

74
Character
ž
75
Character
Ÿ
76
Character
aa
77
Character
aA
78
Character
af
79
Character
aF

View File

@ -0,0 +1,65 @@
0
Character
foo<!--
EndTag
xmp
Character
-->
EndTag
xmp
1
Character
foo<!--
EndTag
xmp
Character
-->
EndTag
xmp
2
Character
foo<!-->baz
EndTag
xmp
3
Character
foo<!-->baz
EndTag
xmp
4
Character
foo<!-->
EndTag
xmp
Comment
Character
baz
EndTag
xmp
5
Character
foo<!-->
EndTag
xmp
Comment
Character
baz
EndTag
xmp
6
Character
& <!-- & --> &
EndTag
xmp
7
Character
foo<!-- x --x>x-- >x--!>x--<>
EndTag
xmp
8
Character
foo<!-- x --x>x-- >x--!>x--<>
EndTag
xmp

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,3 @@
0
Comment
-- >

View File

@ -0,0 +1,231 @@
0
DOCTYPE
html
<none>
<none>
1
DOCTYPE
html
<none>
<none>
2
DOCTYPE
html
<none>
<none>
3
DOCTYPE
html
<none>
<none>
4
Comment
DOC
5
DOCTYPE
foo
<none>
<none>
6
StartTag
h
7
8
Character
<>
9
StartTag
h a=b
10
StartTag
h a=b
11
StartTag
h
EndTag
h
12
StartTag
p
Character
One
StartTag
p
Character
Two
13
StartTag
h
EndTag
h
14
StartTag
h a=b c=d
15
StartTag
h a=b c=d
16
StartTag
h a=b
17
Comment
comment
18
Comment
-
19
Comment
--comment
20
Comment
<!
21
Comment
comment
22
Comment
<!
23
Comment
-
24
Comment
25
Comment
26
Comment
27
Comment
<test
28
Comment
<<
29
Comment
<!test
30
Comment
<!-test
31
Comment
<!--test
32
Comment
<<!--test
33
Character
<test-->
34
Character
<!test-->
35
Character
<!-test-->
36
Character
<!--test-->
37
Character
<!-- < test -->
38
Character
<!-- </ test -->
39
Character
<!-- <test> -->
40
Character
<!-- </test> -->
41
Character
<!--<script>-</script>-->
42
Character
<!--<script>--</script>-->
43
Character
<!--<script>---</script>-->
44
Character
<!--<script> - </script>-->
45
Character
<!--<script> -- </script>-->
46
Character
&
47
Character
&&
48
Character
&
49
Character
&f
50
Character
&#
51
Character
&#x
52
Character
I'm ¬it
53
Character
I'm ∉
54
Character
I'm ¬it
55
Character
I'm ¬in
56
Character
I'm &no
57
Character
&¬;
58
Character
$
59
Character
?
60
StartTag
h a=?
EndTag
h
61
StartTag
h a=&notx
62
StartTag
h a=&not1
63
StartTag
h a=&noti
64
StartTag
h a=©
65
StartTag
s o=& t=
66
StartTag
a a=a&
Character
foo
67
StartTag
plaintext
Character
foobar
68
StartTag
a a=f<

View File

@ -0,0 +1,188 @@
0
DOCTYPE
<none>
<none>
<none>
1
DOCTYPE
html
<none>
<none>
2
DOCTYPE
foo
<none>
<none>
3
DOCTYPE
html
-//W3C//DTD HTML Transitional 4.01//EN
<none>
4
DOCTYPE
html
<none>
<none>
5
DOCTYPE
html
<none>
6
DOCTYPE
html
x
<none>
7
DOCTYPE
html
<none>
-//W3C//DTD HTML Transitional 4.01//EN
8
DOCTYPE
html
<none>
-//W3C//DTD HTML Transitional 4.01//EN
9
DOCTYPE
html
-//W3C//DTD HTML Transitional 4.01//EN
-//W3C//DTD HTML Transitional 4.01//EN
10
DOCTYPE
html
<none>
Character
x
11
DOCTYPE
html
<none>
Character
x
12
DOCTYPE
html
foo
Character
x
13
DOCTYPE
html
foo
Character
x
14
DOCTYPE
html
<none>
<none>
15
Character
<EFBFBD>
16
Character
<EFBFBD>
17
Character
<EFBFBD>
18
Character
<EFBFBD>
19
Character
<EFBFBD><EFBFBD>
20
Character
21
Character
&;
22
StartTag
h a=&
23
StartTag
a<b
24
StartTag
h
25
StartTag
br
26
StartTag
br foo=bar
27
StartTag
h a=b
28
StartTag
h a=b
29
Character
</
30
Comment
1
31
Comment
?namespace
32
Comment
?foo--
33
Character
foo < bar
34
Character
<EFBFBD>
35
Comment
-x
36
Character
x
>
37
StartTag
h
38
StartTag
h a= b=
39
StartTag
h a= "=
40
StartTag
h a= '=
41
Character
abc
42
Character
a
StartTag
b
Character
c
43
Character
a
Comment
b
Character
c
44
Character
a
EndTag
b
Character
c

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,278 @@
0
StartTag
z 0= <=
1
StartTag
z x=<
2
StartTag
z z=z=z
3
StartTag
z ==
4
StartTag
z ==
5
StartTag
z ===
6
StartTag
z ====
7
StartTag
z z=&
8
StartTag
z z=&'
9
StartTag
z z=&
10
StartTag
z z=&"
11
StartTag
z z=&xlink_xmlns;
Character
bar
StartTag
z
12
StartTag
z z= foo
Character
bar
StartTag
z
13
StartTag
foo "=bar
14
StartTag
foo '=bar
15
StartTag
foo a"b=bar
16
StartTag
foo a'b=bar
17
StartTag
foo a=b'c
18
StartTag
foo a=b"c
19
StartTag
foo a=b c=
20
StartTag
foo a=b c=
21
StartTag
br a=b
22
StartTag
bar a=b
23
DOCTYPE
html
<none>
<none>
24
25
26
Character
<EFBFBD>
27
Character
<EFBFBD>
28
Character
A
29
Character
A
30
Character
&#x &#X
31
Character
&#xZ
32
Character
&# &#;
33
Character
&#A
34
Character
𐀀
35
Character
􏿿
36
Character
<EFBFBD>
37
Character
<EFBFBD>
38
Character
<EFBFBD>
39
Character
<EFBFBD>
40
Character
<EFBFBD>
41
Character
<EFBFBD>
42
Character
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
43
StartTag
x
44
EndTag
x
45
StartTag
x x=
46
StartTag
x@az[`az{ @az[`az{=
47
StartTag
x x=1
48
EndTag
x
49
EndTag
x
50
StartTag
br
51
StartTag
xr
52
EndTag
br
53
DOCTYPE
html
AbC
XyZ
54
DOCTYPE
html
aBc
xYz
55
DOCTYPE
html
<none>
XyZ
56
DOCTYPE
html
<none>
xYz
57
Comment
doc
Character
<EFBFBD>
58
Comment
doc<EFBFBD>
59
Comment
doc€
60
Comment
doc﷑
61
Comment
doc🿿
62
Character
?
63
Character
64
Character
65
Character
66
Character
67
Character
68
Character
69
Character
text
text
70
DOCTYPE
html
<none>
<none>
71
DOCTYPE
html
<none>
<none>
72
DOCTYPE
html
<none>
<none>
73
DOCTYPE
html
<none>
<none>
74
DOCTYPE
html
<none>
<none>
Character
text
75
StartTag
a a=aa`
76
77
78
79
80
81
82
83
84

View File

@ -0,0 +1,970 @@
0
Character

1
Character

2
Character

3
Character

4
Character

5
Character

6
Character

7
Character

8
Character
9
Character

10
Character

11
Character

12
Character

13
Character

14
Character

15
Character

16
Character

17
Character

18
Character

19
Character

20
Character

21
Character

22
Character

23
Character

24
Character

25
Character

26
Character

27
Character

28
Character
29
Character
30
Character
31
Character
32
Character
33
Character
34
Character
35
Character
36
Character
37
Character
38
Character
39
Character
40
Character
41
Character
42
Character
43
Character
44
Character
45
Character
46
Character
47
Character
48
Character
49
Character
50
Character
51
Character
52
Character
53
Character
54
Character
55
Character
56
Character
57
Character
58
Character
59
Character
60
Character
61
Character
￿
62
Character
🿾
63
Character
🿿
64
Character
𯿾
65
Character
𯿿
66
Character
𿿾
67
Character
𿿿
68
Character
񏿾
69
Character
񏿿
70
Character
񟿾
71
Character
񟿿
72
Character
񯿾
73
Character
񯿿
74
Character
񿿾
75
Character
񿿿
76
Character
򏿾
77
Character
򏿿
78
Character
򟿾
79
Character
򟿿
80
Character
򯿾
81
Character
򯿿
82
Character
򿿾
83
Character
򿿿
84
Character
󏿾
85
Character
󏿿
86
Character
󟿾
87
Character
󟿿
88
Character
󯿾
89
Character
󯿿
90
Character
󿿾
91
Character
󿿿
92
Character
􏿾
93
Character
􏿿
94
Character
95
Character
96
Character
97
Character
!
98
Character
"
99
Character
#
100
Character
$
101
Character
%
102
Character
&
103
Character
'
104
Character
(
105
Character
)
106
Character
*
107
Character
+
108
Character
,
109
Character
-
110
Character
.
111
Character
/
112
Character
0
113
Character
1
114
Character
2
115
Character
3
116
Character
4
117
Character
5
118
Character
6
119
Character
7
120
Character
8
121
Character
9
122
Character
:
123
Character
;
124
Character
=
125
Character
>
126
Character
?
127
Character
@
128
Character
A
129
Character
B
130
Character
C
131
Character
D
132
Character
E
133
Character
F
134
Character
G
135
Character
H
136
Character
I
137
Character
J
138
Character
K
139
Character
L
140
Character
M
141
Character
N
142
Character
O
143
Character
P
144
Character
Q
145
Character
R
146
Character
S
147
Character
T
148
Character
U
149
Character
V
150
Character
W
151
Character
X
152
Character
Y
153
Character
Z
154
Character
[
155
Character
\
156
Character
]
157
Character
^
158
Character
_
159
Character
`
160
Character
a
161
Character
b
162
Character
c
163
Character
d
164
Character
e
165
Character
f
166
Character
g
167
Character
h
168
Character
i
169
Character
j
170
Character
k
171
Character
l
172
Character
m
173
Character
n
174
Character
o
175
Character
p
176
Character
q
177
Character
r
178
Character
s
179
Character
t
180
Character
u
181
Character
v
182
Character
w
183
Character
x
184
Character
y
185
Character
z
186
Character
{
187
Character
|
188
Character
}
189
Character
~
190
Character
 
191
Character
¡
192
Character
¢
193
Character
£
194
Character
¤
195
Character
¥
196
Character
¦
197
Character
§
198
Character
¨
199
Character
©
200
Character
ª
201
Character
«
202
Character
¬
203
Character
­
204
Character
®
205
Character
¯
206
Character
°
207
Character
±
208
Character
²
209
Character
³
210
Character
´
211
Character
µ
212
Character
213
Character
·
214
Character
¸
215
Character
¹
216
Character
º
217
Character
»
218
Character
¼
219
Character
½
220
Character
¾
221
Character
¿
222
Character
À
223
Character
Á
224
Character
Â
225
Character
Ã
226
Character
Ä
227
Character
Å
228
Character
Æ
229
Character
Ç
230
Character
È
231
Character
É
232
Character
Ê
233
Character
Ë
234
Character
Ì
235
Character
Í
236
Character
Î
237
Character
Ï
238
Character
Ð
239
Character
Ñ
240
Character
Ò
241
Character
Ó
242
Character
Ô
243
Character
Õ
244
Character
Ö
245
Character
×
246
Character
Ø
247
Character
Ù
248
Character
Ú
249
Character
Û
250
Character
Ü
251
Character
Ý
252
Character
Þ
253
Character
ß
254
Character
à
255
Character
á
256
Character
â
257
Character
ã
258
Character
ä
259
Character
å
260
Character
æ
261
Character
ç
262
Character
è
263
Character
é
264
Character
ê
265
Character
ë
266
Character
ì
267
Character
í
268
Character
î
269
Character
ï
270
Character
ð
271
Character
ñ
272
Character
ò
273
Character
ó
274
Character
ô
275
Character
õ
276
Character
ö
277
Character
÷
278
Character
ø
279
Character
ù
280
Character
ú
281
Character
û
282
Character
ü
283
Character
ý
284
Character
þ
285
Character
ÿ
286
Character
287
Character
288
Character
289
Character
290
Character
<EFBFBD>
291
Character
𐀀
292
Character
🿽
293
Character
𠀀
294
Character
𯿽
295
Character
𰀀
296
Character
𿿽
297
Character
񀀀
298
Character
񏿽
299
Character
񐀀
300
Character
񟿽
301
Character
񠀀
302
Character
񯿽
303
Character
񰀀
304
Character
񿿽
305
Character
򀀀
306
Character
򏿽
307
Character
򐀀
308
Character
򟿽
309
Character
򠀀
310
Character
򯿽
311
Character
򰀀
312
Character
򿿽
313
Character
󀀀
314
Character
󏿽
315
Character
󐀀
316
Character
󟿽
317
Character
󠀀
318
Character
󯿽
319
Character
󰀀
320
Character
󿿽
321
Character
􀀀
322
Character
􏿽

View File

@ -0,0 +1,4 @@
0
Character
<EFBFBD>

230
runtest.c
View File

@ -1600,6 +1600,231 @@ done:
return(ret); return(ret);
} }
#if defined(LIBXML_HTML_ENABLED) && defined(LIBXML_PUSH_ENABLED)
/*
 * Per-test-case state for the HTML tokenizer test. A pointer to this
 * struct is stored in ctxt->_private so that the tokenizer SAX callbacks
 * can reach it.
 */
typedef struct {
int dataState; /* when non-zero, copied to ctxt->endCheckState by startDocumentTokenizer */
int inCharacters; /* 1 while a "Character" record is open and still lacks its final newline */
const xmlChar *startTag; /* copied to ctxt->name together with a non-zero dataState */
} xmlTokenizerConfig;
static void
startDocumentTokenizer(void *ctx) {
xmlParserCtxtPtr ctxt = ctx;
xmlTokenizerConfig *config = ctxt->_private;
ctxt->instate = XML_PARSER_CONTENT;
if (config->dataState != 0) {
ctxt->endCheckState = config->dataState;
ctxt->name = config->startTag;
}
}
/**
 * pendingTokenizer:
 * @config: the tokenizer test state
 *
 * Terminate a pending "Character" record: if character data was being
 * written, emit the closing newline to SAXdebug and clear the flag.
 * Does nothing when no character record is open.
 */
static void
pendingTokenizer(xmlTokenizerConfig *config) {
    if (!config->inCharacters)
        return;

    fprintf(SAXdebug, "\n");
    config->inCharacters = 0;
}
static void
internalSubsetTokenizer(void *ctx, const xmlChar *name,
const xmlChar *publicId, const xmlChar *systemId) {
xmlParserCtxtPtr ctxt = ctx;
xmlTokenizerConfig *config = ctxt->_private;
pendingTokenizer(config);
fprintf(SAXdebug, "DOCTYPE\n%s\n%s\n%s\n",
name ? name : BAD_CAST "<none>",
publicId ? publicId : BAD_CAST "<none>",
systemId ? systemId : BAD_CAST "<none>");
}
/**
 * startElementTokenizer:
 * @ctx: the parser context
 * @name: the element name
 * @atts: NULL-terminated array of name/value pairs, or NULL
 *
 * startElement SAX callback of the tokenizer test. Closes any pending
 * character record and writes a "StartTag" record to SAXdebug: the tag
 * name followed by " name=value" for every attribute, all on one line.
 * An attribute whose value is NULL is printed as " name=".
 */
static void
startElementTokenizer(void *ctx, const xmlChar *name, const xmlChar **atts) {
    xmlParserCtxtPtr parser = ctx;
    xmlTokenizerConfig *cfg = parser->_private;
    const xmlChar **pair;

    pendingTokenizer(cfg);

    fprintf(SAXdebug, "StartTag\n%s", name);

    /* Walk the array two slots at a time: pair[0] is the name, pair[1] the value. */
    for (pair = atts; (pair != NULL) && (pair[0] != NULL); pair += 2) {
        const xmlChar *value = pair[1];

        fprintf(SAXdebug, " %s=", pair[0]);
        if (value != NULL)
            fprintf(SAXdebug, "%s", value);
    }

    fprintf(SAXdebug, "\n");
}
static void
endElementTokenizer(void *ctx, const xmlChar *name) {
xmlParserCtxtPtr ctxt = ctx;
xmlTokenizerConfig *config = ctxt->_private;
pendingTokenizer(config);
fprintf(SAXdebug, "EndTag\n%s\n", name);
}
/**
 * charactersTokenizer:
 * @ctx: the parser context
 * @ch: the character data (not necessarily zero-terminated)
 * @len: number of bytes in @ch
 *
 * characters SAX callback of the tokenizer test. Writes the "Character"
 * header once per run of character data and then appends the raw bytes
 * to SAXdebug, so consecutive callbacks are merged into a single record.
 * The record's final newline is written later by pendingTokenizer.
 */
static void
charactersTokenizer(void *ctx, const xmlChar *ch, int len) {
    xmlParserCtxtPtr parser = ctx;
    xmlTokenizerConfig *cfg = parser->_private;

    if (cfg->inCharacters == 0) {
        /* First chunk of a new run: open the record. */
        fprintf(SAXdebug, "Character\n");
        cfg->inCharacters = 1;
    }

    fwrite(ch, 1, len, SAXdebug);
}
static void
commentTokenizer(void *ctx, const xmlChar *value) {
xmlParserCtxtPtr ctxt = ctx;
xmlTokenizerConfig *config = ctxt->_private;
pendingTokenizer(config);
fprintf(SAXdebug, "Comment\n%s\n", value);
}
static void
endDocumentTokenizer(void *ctx) {
xmlParserCtxtPtr ctxt = ctx;
xmlTokenizerConfig *config = ctxt->_private;
pendingTokenizer(config);
}
/*
 * SAX handler used by htmlTokenizerTest. The initializer is positional,
 * so the order of entries must follow the field order of xmlSAXHandler;
 * each slot is labelled below. Only the callbacks needed to dump tokens
 * are set, every other slot is NULL.
 */
static xmlSAXHandler tokenizeHtmlSAXHandler = {
internalSubsetTokenizer, /* internalSubset */
NULL, /* isStandalone */
NULL, /* hasInternalSubset */
NULL, /* hasExternalSubset */
NULL, /* resolveEntity */
NULL, /* getEntity */
NULL, /* entityDecl */
NULL, /* notationDecl */
NULL, /* attributeDecl */
NULL, /* elementDecl */
NULL, /* unparsedEntityDecl */
NULL, /* setDocumentLocator */
startDocumentTokenizer, /* startDocument */
endDocumentTokenizer, /* endDocument */
startElementTokenizer, /* startElement */
endElementTokenizer, /* endElement */
NULL, /* reference */
charactersTokenizer, /* characters */
NULL, /* ignorableWhitespace */
NULL, /* processingInstruction */
commentTokenizer, /* comment */
NULL, /* warning */
NULL, /* error */
NULL, /* fatalError */
NULL, /* getParameterEntity */
NULL, /* cdataBlock */
NULL, /* externalSubset */
1, /* initialized */
NULL, /* _private */
NULL, /* startElementNs */
NULL, /* endElementNs */
NULL /* serror */
};
/**
 * htmlTokenizerTest:
 * @filename: the file containing the tokenizer test cases
 * @result: the file with expected result
 * @err: the file with error messages (unused)
 * @options: extra parser options, combined with HTML_PARSE_HTML5
 *
 * Run the HTML tokenizer over every test case in @filename and compare
 * the dumped token stream with @result.
 *
 * Each test case consists of a header line
 *
 *     <test number> <start tag> <data state> <size>
 *
 * followed by exactly <size> bytes of input. Every case is parsed with a
 * fresh push parser using tokenizeHtmlSAXHandler; the callbacks write
 * the tokens to a temporary file which is finally compared to @result.
 *
 * All resources (input file, output file, temporary file, data buffer)
 * are released on every exit path.
 *
 * Returns 0 in case of success, 1 if the output differs from @result,
 * -1 if the test could not be run (I/O error, malformed test file or
 * allocation failure).
 */
static int
htmlTokenizerTest(const char *filename, const char *result,
                  const char *err ATTRIBUTE_UNUSED,
                  int options) {
    xmlTokenizerConfig config;
    char startTag[31];
    FILE *input = NULL;
    char *temp;
    char *data = NULL;
    unsigned testNum, dataState, size;
    int ret = 0, counter = 0;

    nb_tests++;

    temp = resultFilename(filename, temp_directory, ".res");
    if (temp == NULL) {
        fprintf(stderr, "out of memory\n");
        fatalError();
    }

    SAXdebug = fopen(temp, "wb");
    if (SAXdebug == NULL) {
        fprintf(stderr, "Failed to write to %s\n", temp);
        free(temp);
        return(-1);
    }

    input = fopen(filename, "rb");
    if (input == NULL) {
        fprintf(stderr, "%s: failed to open\n", filename);
        ret = -1;
        goto done;
    }

    /*
     * The width in %30s matches sizeof(startTag) - 1. The trailing
     * %*1[\n] consumes the single newline that ends the header line so
     * that fread() starts exactly at the first byte of the test data.
     */
    while (fscanf(input, "%u %30s %u %u%*1[\n]",
                  &testNum, startTag, &dataState, &size) >= 4) {
        htmlParserCtxtPtr ctxt;

        fprintf(SAXdebug, "%d\n", counter++);

        data = xmlMalloc((size_t) size + 1);
        if (data == NULL) {
            fprintf(stderr, "out of memory\n");
            ret = -1;
            goto done;
        }
        if (fread(data, 1, size, input) != size) {
            fprintf(stderr, "%s:%d: unexpected eof\n", filename, counter);
            ret = -1;
            goto done;
        }

        ctxt = htmlCreatePushParserCtxt(&tokenizeHtmlSAXHandler, NULL, NULL, 0,
                                        NULL, XML_CHAR_ENCODING_UTF8);
        if (ctxt == NULL) {
            fprintf(stderr, "out of memory\n");
            ret = -1;
            goto done;
        }

        /* Hand the per-case settings to the SAX callbacks. */
        config.dataState = dataState;
        config.startTag = BAD_CAST startTag;
        config.inCharacters = 0;
        ctxt->_private = &config;

        htmlCtxtUseOptions(ctxt, options | HTML_PARSE_HTML5);
        htmlParseChunk(ctxt, data, size, 1);

        htmlFreeParserCtxt(ctxt);
        xmlFree(data);
        data = NULL;
    }

    /* fscanf stopped before end of file: the next header is malformed. */
    if (!feof(input)) {
        fprintf(stderr, "%s:%d: invalid format\n", filename, counter);
        ret = -1;
        goto done;
    }

done:
    if (data != NULL)
        xmlFree(data);
    if (input != NULL)
        fclose(input);
    fclose(SAXdebug);

    /* Only compare the output if every test case was processed. */
    if ((ret == 0) && (compareFiles(temp, result))) {
        fprintf(stderr, "Got a difference for %s\n", filename);
        ret = 1;
    }

    if (temp != NULL) {
        unlink(temp);
        free(temp);
    }
    return(ret);
}
#endif /* HTML */
/************************************************************************ /************************************************************************
* * * *
* Parse to tree based tests * * Parse to tree based tests *
@ -4954,6 +5179,11 @@ testDesc testDescriptions[] = {
{ "HTML SAX regression tests" , { "HTML SAX regression tests" ,
saxParseTest, "./test/HTML/*", "result/HTML/", ".sax", NULL, saxParseTest, "./test/HTML/*", "result/HTML/", ".sax", NULL,
XML_PARSE_HTML }, XML_PARSE_HTML },
#ifdef LIBXML_PUSH_ENABLED
{ "HTML tokenization tests",
htmlTokenizerTest,
"./test/html-tokenizer/*.test", "result/html-tokenizer/", "", NULL, 0 },
#endif
#endif #endif
#ifdef LIBXML_VALID_ENABLED #ifdef LIBXML_VALID_ENABLED
{ "Valid documents regression tests" , { "Valid documents regression tests" ,

View File

@ -0,0 +1,27 @@
The test cases in this directory are derived from the html5lib test
suite available from https://github.com/html5lib/html5lib-tests under
the following license:
---
Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and
other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,48 @@
0 plaintext 3 12
<head>&body;
1 plaintext 3 18
</plaintext>&body;
2 xmp 1 9
foo</xmp>
3 xmp 2 9
foo</xmp>
4 xmp 1 9
foo</xMp>
5 xmp 2 9
foo</xMp>
6 xmp 1 9
foo</xmp
7 xmp 2 9
foo</xmp
8 xmp 1 8
foo</xmp
9 xmp 2 8
foo</xmp
10 xmp 1 9
foo</xmp/
11 xmp 2 9
foo</xmp/
12 xmp 1 9
foo</xmp<
13 xmp 2 9
foo</xmp<
14 xmp 1 15
</foo>bar</xmp>
15 xmp 2 15
</foo>bar</xmp>
16 xmp 1 16
</xmp</xmp</xmp>
17 xmp 2 16
</xmp</xmp</xmp>
18 xmp 1 18
</foo>bar</xmpaar>
19 xmp 2 18
</foo>bar</xmpaar>
20 xmp 1 15
foo</xmp></baz>
21 xmp 2 15
foo</xmp></baz>
22 xmp 2 5
&foo;
23 textarea 1 4
&lt;

Binary file not shown.

View File

@ -0,0 +1,160 @@
0 - 0 14
<h a="&noti;">
1 - 0 14
<h a="&lang=">
2 - 0 13
<h a="&not=">
3 - 0 14
<h a='&noti;'>
4 - 0 14
<h a='&lang='>
5 - 0 13
<h a='&not='>
6 - 0 12
<h a=&noti;>
7 - 0 12
<h a=&lang=>
8 - 0 11
<h a=&not=>
9 - 0 14
&rrrraannddom;
10 - 0 6
&noti;
11 - 0 950
&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;
12 - 0 6
&#013;
13 - 0 7
&#x00D;
14 - 0 7
&#0128;
15 - 0 7
&#0129;
16 - 0 7
&#0130;
17 - 0 7
&#0131;
18 - 0 7
&#0132;
19 - 0 7
&#0133;
20 - 0 7
&#0134;
21 - 0 7
&#0135;
22 - 0 7
&#0136;
23 - 0 7
&#0137;
24 - 0 7
&#0138;
25 - 0 7
&#0139;
26 - 0 7
&#0140;
27 - 0 7
&#0141;
28 - 0 7
&#0142;
29 - 0 7
&#0143;
30 - 0 7
&#0144;
31 - 0 7
&#0145;
32 - 0 7
&#0146;
33 - 0 7
&#0147;
34 - 0 7
&#0148;
35 - 0 7
&#0149;
36 - 0 7
&#0150;
37 - 0 7
&#0151;
38 - 0 7
&#0152;
39 - 0 7
&#0153;
40 - 0 7
&#0154;
41 - 0 7
&#0155;
42 - 0 7
&#0156;
43 - 0 7
&#0157;
44 - 0 7
&#x080;
45 - 0 7
&#x081;
46 - 0 7
&#x082;
47 - 0 7
&#x083;
48 - 0 7
&#x084;
49 - 0 7
&#x085;
50 - 0 7
&#x086;
51 - 0 7
&#x087;
52 - 0 7
&#x088;
53 - 0 7
&#x089;
54 - 0 7
&#x08A;
55 - 0 7
&#x08B;
56 - 0 7
&#x08C;
57 - 0 7
&#x08D;
58 - 0 7
&#x08E;
59 - 0 7
&#x08F;
60 - 0 7
&#x090;
61 - 0 7
&#x091;
62 - 0 7
&#x092;
63 - 0 7
&#x093;
64 - 0 7
&#x094;
65 - 0 7
&#x095;
66 - 0 7
&#x096;
67 - 0 7
&#x097;
68 - 0 7
&#x098;
69 - 0 7
&#x099;
70 - 0 7
&#x09A;
71 - 0 7
&#x09B;
72 - 0 7
&#x09C;
73 - 0 7
&#x09D;
74 - 0 7
&#x09E;
75 - 0 7
&#x09F;
76 - 0 5
&#97a
77 - 0 5
&#97A
78 - 0 5
&#97f
79 - 0 5
&#97F

View File

@ -0,0 +1,18 @@
0 xmp 1 22
foo<!--</xmp>--></xmp>
1 xmp 2 22
foo<!--</xmp>--></xmp>
2 xmp 1 17
foo<!-->baz</xmp>
3 xmp 2 17
foo<!-->baz</xmp>
4 xmp 1 28
foo<!--></xmp><!-->baz</xmp>
5 xmp 2 28
foo<!--></xmp><!-->baz</xmp>
6 xmp 1 34
&amp; <!-- &amp; --> &amp; </xmp>
7 xmp 1 35
foo<!-- x --x>x-- >x--!>x--<></xmp>
8 xmp 2 35
foo<!-- x --x>x-- >x--!>x--<></xmp>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,672 @@
0 - 0 13
&#11111111111
1 - 0 12
&#1111111111
2 - 0 14
&#111111111111
3 - 0 14
&#11111111111x
4 - 0 13
&#1111111111x
5 - 0 15
&#111111111111x
6 - 0 14
&#11111111111;
7 - 0 13
&#1111111111;
8 - 0 15
&#111111111111;
9 - 0 8
&#x0000;
10 - 0 8
&#x0001;
11 - 0 8
&#x0002;
12 - 0 8
&#x0003;
13 - 0 8
&#x0004;
14 - 0 8
&#x0005;
15 - 0 8
&#x0006;
16 - 0 8
&#x0007;
17 - 0 8
&#x0008;
18 - 0 8
&#x000b;
19 - 0 8
&#x000e;
20 - 0 8
&#x000f;
21 - 0 8
&#x0010;
22 - 0 8
&#x0011;
23 - 0 8
&#x0012;
24 - 0 8
&#x0013;
25 - 0 8
&#x0014;
26 - 0 8
&#x0015;
27 - 0 8
&#x0016;
28 - 0 8
&#x0017;
29 - 0 8
&#x0018;
30 - 0 8
&#x0019;
31 - 0 8
&#x001a;
32 - 0 8
&#x001b;
33 - 0 8
&#x001c;
34 - 0 8
&#x001d;
35 - 0 8
&#x001e;
36 - 0 8
&#x001f;
37 - 0 8
&#x007f;
38 - 0 8
&#xd800;
39 - 0 8
&#xdfff;
40 - 0 8
&#xfdd0;
41 - 0 8
&#xfdd1;
42 - 0 8
&#xfdd2;
43 - 0 8
&#xfdd3;
44 - 0 8
&#xfdd4;
45 - 0 8
&#xfdd5;
46 - 0 8
&#xfdd6;
47 - 0 8
&#xfdd7;
48 - 0 8
&#xfdd8;
49 - 0 8
&#xfdd9;
50 - 0 8
&#xfdda;
51 - 0 8
&#xfddb;
52 - 0 8
&#xfddc;
53 - 0 8
&#xfddd;
54 - 0 8
&#xfdde;
55 - 0 8
&#xfddf;
56 - 0 8
&#xfde0;
57 - 0 8
&#xfde1;
58 - 0 8
&#xfde2;
59 - 0 8
&#xfde3;
60 - 0 8
&#xfde4;
61 - 0 8
&#xfde5;
62 - 0 8
&#xfde6;
63 - 0 8
&#xfde7;
64 - 0 8
&#xfde8;
65 - 0 8
&#xfde9;
66 - 0 8
&#xfdea;
67 - 0 8
&#xfdeb;
68 - 0 8
&#xfdec;
69 - 0 8
&#xfded;
70 - 0 8
&#xfdee;
71 - 0 8
&#xfdef;
72 - 0 8
&#xfffe;
73 - 0 8
&#xffff;
74 - 0 9
&#x1fffe;
75 - 0 9
&#x1ffff;
76 - 0 9
&#x2fffe;
77 - 0 9
&#x2ffff;
78 - 0 9
&#x3fffe;
79 - 0 9
&#x3ffff;
80 - 0 9
&#x4fffe;
81 - 0 9
&#x4ffff;
82 - 0 9
&#x5fffe;
83 - 0 9
&#x5ffff;
84 - 0 9
&#x6fffe;
85 - 0 9
&#x6ffff;
86 - 0 9
&#x7fffe;
87 - 0 9
&#x7ffff;
88 - 0 9
&#x8fffe;
89 - 0 9
&#x8ffff;
90 - 0 9
&#x9fffe;
91 - 0 9
&#x9ffff;
92 - 0 9
&#xafffe;
93 - 0 9
&#xaffff;
94 - 0 9
&#xbfffe;
95 - 0 9
&#xbffff;
96 - 0 9
&#xcfffe;
97 - 0 9
&#xcffff;
98 - 0 9
&#xdfffe;
99 - 0 9
&#xdffff;
100 - 0 9
&#xefffe;
101 - 0 9
&#xeffff;
102 - 0 9
&#xffffe;
103 - 0 9
&#xfffff;
104 - 0 10
&#x10fffe;
105 - 0 10
&#x10ffff;
106 - 0 8
&#x0009;
107 - 0 8
&#x000a;
108 - 0 8
&#x0020;
109 - 0 8
&#x0021;
110 - 0 8
&#x0022;
111 - 0 8
&#x0023;
112 - 0 8
&#x0024;
113 - 0 8
&#x0025;
114 - 0 8
&#x0026;
115 - 0 8
&#x0027;
116 - 0 8
&#x0028;
117 - 0 8
&#x0029;
118 - 0 8
&#x002a;
119 - 0 8
&#x002b;
120 - 0 8
&#x002c;
121 - 0 8
&#x002d;
122 - 0 8
&#x002e;
123 - 0 8
&#x002f;
124 - 0 8
&#x0030;
125 - 0 8
&#x0031;
126 - 0 8
&#x0032;
127 - 0 8
&#x0033;
128 - 0 8
&#x0034;
129 - 0 8
&#x0035;
130 - 0 8
&#x0036;
131 - 0 8
&#x0037;
132 - 0 8
&#x0038;
133 - 0 8
&#x0039;
134 - 0 8
&#x003a;
135 - 0 8
&#x003b;
136 - 0 8
&#x003c;
137 - 0 8
&#x003d;
138 - 0 8
&#x003e;
139 - 0 8
&#x003f;
140 - 0 8
&#x0040;
141 - 0 8
&#x0041;
142 - 0 8
&#x0042;
143 - 0 8
&#x0043;
144 - 0 8
&#x0044;
145 - 0 8
&#x0045;
146 - 0 8
&#x0046;
147 - 0 8
&#x0047;
148 - 0 8
&#x0048;
149 - 0 8
&#x0049;
150 - 0 8
&#x004a;
151 - 0 8
&#x004b;
152 - 0 8
&#x004c;
153 - 0 8
&#x004d;
154 - 0 8
&#x004e;
155 - 0 8
&#x004f;
156 - 0 8
&#x0050;
157 - 0 8
&#x0051;
158 - 0 8
&#x0052;
159 - 0 8
&#x0053;
160 - 0 8
&#x0054;
161 - 0 8
&#x0055;
162 - 0 8
&#x0056;
163 - 0 8
&#x0057;
164 - 0 8
&#x0058;
165 - 0 8
&#x0059;
166 - 0 8
&#x005a;
167 - 0 8
&#x005b;
168 - 0 8
&#x005c;
169 - 0 8
&#x005d;
170 - 0 8
&#x005e;
171 - 0 8
&#x005f;
172 - 0 8
&#x0060;
173 - 0 8
&#x0061;
174 - 0 8
&#x0062;
175 - 0 8
&#x0063;
176 - 0 8
&#x0064;
177 - 0 8
&#x0065;
178 - 0 8
&#x0066;
179 - 0 8
&#x0067;
180 - 0 8
&#x0068;
181 - 0 8
&#x0069;
182 - 0 8
&#x006a;
183 - 0 8
&#x006b;
184 - 0 8
&#x006c;
185 - 0 8
&#x006d;
186 - 0 8
&#x006e;
187 - 0 8
&#x006f;
188 - 0 8
&#x0070;
189 - 0 8
&#x0071;
190 - 0 8
&#x0072;
191 - 0 8
&#x0073;
192 - 0 8
&#x0074;
193 - 0 8
&#x0075;
194 - 0 8
&#x0076;
195 - 0 8
&#x0077;
196 - 0 8
&#x0078;
197 - 0 8
&#x0079;
198 - 0 8
&#x007a;
199 - 0 8
&#x007b;
200 - 0 8
&#x007c;
201 - 0 8
&#x007d;
202 - 0 8
&#x007e;
203 - 0 8
&#x00a0;
204 - 0 8
&#x00a1;
205 - 0 8
&#x00a2;
206 - 0 8
&#x00a3;
207 - 0 8
&#x00a4;
208 - 0 8
&#x00a5;
209 - 0 8
&#x00a6;
210 - 0 8
&#x00a7;
211 - 0 8
&#x00a8;
212 - 0 8
&#x00a9;
213 - 0 8
&#x00aa;
214 - 0 8
&#x00ab;
215 - 0 8
&#x00ac;
216 - 0 8
&#x00ad;
217 - 0 8
&#x00ae;
218 - 0 8
&#x00af;
219 - 0 8
&#x00b0;
220 - 0 8
&#x00b1;
221 - 0 8
&#x00b2;
222 - 0 8
&#x00b3;
223 - 0 8
&#x00b4;
224 - 0 8
&#x00b5;
225 - 0 8
&#x00b6;
226 - 0 8
&#x00b7;
227 - 0 8
&#x00b8;
228 - 0 8
&#x00b9;
229 - 0 8
&#x00ba;
230 - 0 8
&#x00bb;
231 - 0 8
&#x00bc;
232 - 0 8
&#x00bd;
233 - 0 8
&#x00be;
234 - 0 8
&#x00bf;
235 - 0 8
&#x00c0;
236 - 0 8
&#x00c1;
237 - 0 8
&#x00c2;
238 - 0 8
&#x00c3;
239 - 0 8
&#x00c4;
240 - 0 8
&#x00c5;
241 - 0 8
&#x00c6;
242 - 0 8
&#x00c7;
243 - 0 8
&#x00c8;
244 - 0 8
&#x00c9;
245 - 0 8
&#x00ca;
246 - 0 8
&#x00cb;
247 - 0 8
&#x00cc;
248 - 0 8
&#x00cd;
249 - 0 8
&#x00ce;
250 - 0 8
&#x00cf;
251 - 0 8
&#x00d0;
252 - 0 8
&#x00d1;
253 - 0 8
&#x00d2;
254 - 0 8
&#x00d3;
255 - 0 8
&#x00d4;
256 - 0 8
&#x00d5;
257 - 0 8
&#x00d6;
258 - 0 8
&#x00d7;
259 - 0 8
&#x00d8;
260 - 0 8
&#x00d9;
261 - 0 8
&#x00da;
262 - 0 8
&#x00db;
263 - 0 8
&#x00dc;
264 - 0 8
&#x00dd;
265 - 0 8
&#x00de;
266 - 0 8
&#x00df;
267 - 0 8
&#x00e0;
268 - 0 8
&#x00e1;
269 - 0 8
&#x00e2;
270 - 0 8
&#x00e3;
271 - 0 8
&#x00e4;
272 - 0 8
&#x00e5;
273 - 0 8
&#x00e6;
274 - 0 8
&#x00e7;
275 - 0 8
&#x00e8;
276 - 0 8
&#x00e9;
277 - 0 8
&#x00ea;
278 - 0 8
&#x00eb;
279 - 0 8
&#x00ec;
280 - 0 8
&#x00ed;
281 - 0 8
&#x00ee;
282 - 0 8
&#x00ef;
283 - 0 8
&#x00f0;
284 - 0 8
&#x00f1;
285 - 0 8
&#x00f2;
286 - 0 8
&#x00f3;
287 - 0 8
&#x00f4;
288 - 0 8
&#x00f5;
289 - 0 8
&#x00f6;
290 - 0 8
&#x00f7;
291 - 0 8
&#x00f8;
292 - 0 8
&#x00f9;
293 - 0 8
&#x00fa;
294 - 0 8
&#x00fb;
295 - 0 8
&#x00fc;
296 - 0 8
&#x00fd;
297 - 0 8
&#x00fe;
298 - 0 8
&#x00ff;
299 - 0 8
&#xd7ff;
300 - 0 8
&#xe000;
301 - 0 8
&#xfdcf;
302 - 0 8
&#xfdf0;
303 - 0 8
&#xfffd;
304 - 0 9
&#x10000;
305 - 0 9
&#x1fffd;
306 - 0 9
&#x20000;
307 - 0 9
&#x2fffd;
308 - 0 9
&#x30000;
309 - 0 9
&#x3fffd;
310 - 0 9
&#x40000;
311 - 0 9
&#x4fffd;
312 - 0 9
&#x50000;
313 - 0 9
&#x5fffd;
314 - 0 9
&#x60000;
315 - 0 9
&#x6fffd;
316 - 0 9
&#x70000;
317 - 0 9
&#x7fffd;
318 - 0 9
&#x80000;
319 - 0 9
&#x8fffd;
320 - 0 9
&#x90000;
321 - 0 9
&#x9fffd;
322 - 0 9
&#xa0000;
323 - 0 9
&#xafffd;
324 - 0 9
&#xb0000;
325 - 0 9
&#xbfffd;
326 - 0 9
&#xc0000;
327 - 0 9
&#xcfffd;
328 - 0 9
&#xd0000;
329 - 0 9
&#xdfffd;
330 - 0 9
&#xe0000;
331 - 0 9
&#xefffd;
332 - 0 9
&#xf0000;
333 - 0 9
&#xffffd;
334 - 0 10
&#x100000;
335 - 0 10
&#x10fffd;

View File

@ -0,0 +1,2 @@
0 - 0 8
<!---- >

View File

@ -0,0 +1,138 @@
0 - 0 15
<!DOCTYPE html>
1 - 0 15
<!DOCTYPE HTML>
2 - 0 15
<!DOCTYPE HtMl>
3 - 0 14
<!DOCTYPE HtMl
4 - 0 6
<!DOC>
5 - 0 14
<!DOCTYPE foo>
6 - 0 3
<h>
7 - 0 3
</>
8 - 0 2
<>
9 - 0 9
<h a='b'>
10 - 0 7
<h a=b>
11 - 0 7
<h></h>
12 - 0 12
<p>One<p>Two
13 - 0 13
<h></h a='b'>
14 - 0 15
<h a='b' c='d'>
15 - 0 14
<h a='b'c='d'>
16 - 0 15
<h a='b' a='d'>
17 - 0 14
<!--comment-->
18 - 0 8
<!----->
19 - 0 18
<!-- --comment -->
20 - 0 9
<!--<!-->
21 - 0 11
<!--comment
22 - 0 9
<!-- <!--
23 - 0 3
<!-
24 - 0 5
<!-->
25 - 0 6
<!--->
26 - 0 7
<!---->
27 - 0 13
<!-- <test-->
28 - 0 9
<!--<<-->
29 - 0 14
<!-- <!test-->
30 - 0 15
<!-- <!-test-->
31 - 0 16
<!-- <!--test-->
32 - 0 17
<!-- <<!--test-->
33 - 4 8
<test-->
34 - 4 9
<!test-->
35 - 4 10
<!-test-->
36 - 4 11
<!--test-->
37 - 4 15
<!-- < test -->
38 - 4 16
<!-- </ test -->
39 - 4 15
<!-- <test> -->
40 - 4 16
<!-- </test> -->
41 - 4 25
<!--<script>-</script>-->
42 - 4 26
<!--<script>--</script>-->
43 - 4 27
<!--<script>---</script>-->
44 - 4 27
<!--<script> - </script>-->
45 - 4 28
<!--<script> -- </script>-->
46 - 0 1
&
47 - 0 2
&&
48 - 0 2
&
49 - 0 2
&f
50 - 0 2
&#
51 - 0 3
&#x
52 - 0 11
I'm &not;it
53 - 0 11
I'm &notin;
54 - 0 10
I'm &notit
55 - 0 10
I'm &notin
56 - 0 7
I'm &no
57 - 0 4
&¬;
58 - 0 7
&#0036;
59 - 0 6
&#x3f;
60 - 0 18
<h a='&#x3f;'></h>
61 - 0 13
<h a='&notx'>
62 - 0 13
<h a='&not1'>
63 - 0 13
<h a='&noti'>
64 - 0 13
<h a='&COPY'>
65 - 0 9
<s o=& t>
66 - 0 11
<a a=a&>foo
67 - 0 17
<plaintext>foobar
68 - 0 8
<a a=f<>

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,647 @@
0 - 0 1

1 - 0 1

2 - 0 1

3 - 0 1

4 - 0 1

5 - 0 1

6 - 0 1

7 - 0 1

8 - 0 1
9 - 0 1

10 - 0 1

11 - 0 1

12 - 0 1

13 - 0 1

14 - 0 1

15 - 0 1

16 - 0 1

17 - 0 1

18 - 0 1

19 - 0 1

20 - 0 1

21 - 0 1

22 - 0 1

23 - 0 1

24 - 0 1

25 - 0 1

26 - 0 1

27 - 0 1

28 - 0 3
29 - 0 3
30 - 0 3
31 - 0 3
32 - 0 3
33 - 0 3
34 - 0 3
35 - 0 3
36 - 0 3
37 - 0 3
38 - 0 3
39 - 0 3
40 - 0 3
41 - 0 3
42 - 0 3
43 - 0 3
44 - 0 3
45 - 0 3
46 - 0 3
47 - 0 3
48 - 0 3
49 - 0 3
50 - 0 3
51 - 0 3
52 - 0 3
53 - 0 3
54 - 0 3
55 - 0 3
56 - 0 3
57 - 0 3
58 - 0 3
59 - 0 3
60 - 0 3
61 - 0 3
￿
62 - 0 4
🿾
63 - 0 4
🿿
64 - 0 4
𯿾
65 - 0 4
𯿿
66 - 0 4
𿿾
67 - 0 4
𿿿
68 - 0 4
񏿾
69 - 0 4
񏿿
70 - 0 4
񟿾
71 - 0 4
񟿿
72 - 0 4
񯿾
73 - 0 4
񯿿
74 - 0 4
񿿾
75 - 0 4
񿿿
76 - 0 4
򏿾
77 - 0 4
򏿿
78 - 0 4
򟿾
79 - 0 4
򟿿
80 - 0 4
򯿾
81 - 0 4
򯿿
82 - 0 4
򿿾
83 - 0 4
򿿿
84 - 0 4
󏿾
85 - 0 4
󏿿
86 - 0 4
󟿾
87 - 0 4
󟿿
88 - 0 4
󯿾
89 - 0 4
󯿿
90 - 0 4
󿿾
91 - 0 4
󿿿
92 - 0 4
􏿾
93 - 0 4
􏿿
94 - 0 1
95 - 0 1
96 - 0 1
97 - 0 1
!
98 - 0 1
"
99 - 0 1
#
100 - 0 1
$
101 - 0 1
%
102 - 0 1
&
103 - 0 1
'
104 - 0 1
(
105 - 0 1
)
106 - 0 1
*
107 - 0 1
+
108 - 0 1
,
109 - 0 1
-
110 - 0 1
.
111 - 0 1
/
112 - 0 1
0
113 - 0 1
1
114 - 0 1
2
115 - 0 1
3
116 - 0 1
4
117 - 0 1
5
118 - 0 1
6
119 - 0 1
7
120 - 0 1
8
121 - 0 1
9
122 - 0 1
:
123 - 0 1
;
124 - 0 1
=
125 - 0 1
>
126 - 0 1
?
127 - 0 1
@
128 - 0 1
A
129 - 0 1
B
130 - 0 1
C
131 - 0 1
D
132 - 0 1
E
133 - 0 1
F
134 - 0 1
G
135 - 0 1
H
136 - 0 1
I
137 - 0 1
J
138 - 0 1
K
139 - 0 1
L
140 - 0 1
M
141 - 0 1
N
142 - 0 1
O
143 - 0 1
P
144 - 0 1
Q
145 - 0 1
R
146 - 0 1
S
147 - 0 1
T
148 - 0 1
U
149 - 0 1
V
150 - 0 1
W
151 - 0 1
X
152 - 0 1
Y
153 - 0 1
Z
154 - 0 1
[
155 - 0 1
\
156 - 0 1
]
157 - 0 1
^
158 - 0 1
_
159 - 0 1
`
160 - 0 1
a
161 - 0 1
b
162 - 0 1
c
163 - 0 1
d
164 - 0 1
e
165 - 0 1
f
166 - 0 1
g
167 - 0 1
h
168 - 0 1
i
169 - 0 1
j
170 - 0 1
k
171 - 0 1
l
172 - 0 1
m
173 - 0 1
n
174 - 0 1
o
175 - 0 1
p
176 - 0 1
q
177 - 0 1
r
178 - 0 1
s
179 - 0 1
t
180 - 0 1
u
181 - 0 1
v
182 - 0 1
w
183 - 0 1
x
184 - 0 1
y
185 - 0 1
z
186 - 0 1
{
187 - 0 1
|
188 - 0 1
}
189 - 0 1
~
190 - 0 2
 
191 - 0 2
¡
192 - 0 2
¢
193 - 0 2
£
194 - 0 2
¤
195 - 0 2
¥
196 - 0 2
¦
197 - 0 2
§
198 - 0 2
¨
199 - 0 2
©
200 - 0 2
ª
201 - 0 2
«
202 - 0 2
¬
203 - 0 2
­
204 - 0 2
®
205 - 0 2
¯
206 - 0 2
°
207 - 0 2
±
208 - 0 2
²
209 - 0 2
³
210 - 0 2
´
211 - 0 2
µ
212 - 0 2
213 - 0 2
·
214 - 0 2
¸
215 - 0 2
¹
216 - 0 2
º
217 - 0 2
»
218 - 0 2
¼
219 - 0 2
½
220 - 0 2
¾
221 - 0 2
¿
222 - 0 2
À
223 - 0 2
Á
224 - 0 2
Â
225 - 0 2
Ã
226 - 0 2
Ä
227 - 0 2
Å
228 - 0 2
Æ
229 - 0 2
Ç
230 - 0 2
È
231 - 0 2
É
232 - 0 2
Ê
233 - 0 2
Ë
234 - 0 2
Ì
235 - 0 2
Í
236 - 0 2
Î
237 - 0 2
Ï
238 - 0 2
Ð
239 - 0 2
Ñ
240 - 0 2
Ò
241 - 0 2
Ó
242 - 0 2
Ô
243 - 0 2
Õ
244 - 0 2
Ö
245 - 0 2
×
246 - 0 2
Ø
247 - 0 2
Ù
248 - 0 2
Ú
249 - 0 2
Û
250 - 0 2
Ü
251 - 0 2
Ý
252 - 0 2
Þ
253 - 0 2
ß
254 - 0 2
à
255 - 0 2
á
256 - 0 2
â
257 - 0 2
ã
258 - 0 2
ä
259 - 0 2
å
260 - 0 2
æ
261 - 0 2
ç
262 - 0 2
è
263 - 0 2
é
264 - 0 2
ê
265 - 0 2
ë
266 - 0 2
ì
267 - 0 2
í
268 - 0 2
î
269 - 0 2
ï
270 - 0 2
ð
271 - 0 2
ñ
272 - 0 2
ò
273 - 0 2
ó
274 - 0 2
ô
275 - 0 2
õ
276 - 0 2
ö
277 - 0 2
÷
278 - 0 2
ø
279 - 0 2
ù
280 - 0 2
ú
281 - 0 2
û
282 - 0 2
ü
283 - 0 2
ý
284 - 0 2
þ
285 - 0 2
ÿ
286 - 0 3
287 - 0 3
288 - 0 3
289 - 0 3
290 - 0 3
<EFBFBD>
291 - 0 4
𐀀
292 - 0 4
🿽
293 - 0 4
𠀀
294 - 0 4
𯿽
295 - 0 4
𰀀
296 - 0 4
𿿽
297 - 0 4
񀀀
298 - 0 4
񏿽
299 - 0 4
񐀀
300 - 0 4
񟿽
301 - 0 4
񠀀
302 - 0 4
񯿽
303 - 0 4
񰀀
304 - 0 4
񿿽
305 - 0 4
򀀀
306 - 0 4
򏿽
307 - 0 4
򐀀
308 - 0 4
򟿽
309 - 0 4
򠀀
310 - 0 4
򯿽
311 - 0 4
򰀀
312 - 0 4
򿿽
313 - 0 4
󀀀
314 - 0 4
󏿽
315 - 0 4
󐀀
316 - 0 4
󟿽
317 - 0 4
󠀀
318 - 0 4
󯿽
319 - 0 4
󰀀
320 - 0 4
󿿽
321 - 0 4
􀀀
322 - 0 4
􏿽

Binary file not shown.

86
tools/genHtml5LibTests.py Normal file
View File

@ -0,0 +1,86 @@
#!/usr/bin/env python3
import glob
import json
import re
# Numeric codes written into the generated test headers for each html5lib
# tokenizer initial state.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    'CDATA section state': 5,
}

# A literal "\uXXXX" escape sequence: backslash, "u" and four hex digits.
_ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')
# A literal escape in the surrogate range (U+D800..U+DFFF).
_SURROGATE_RE = re.compile(r'\\uD[89A-F]', re.I)
# Base name of a "*.test" path; accepts "/" and "\" as separators so the
# name is extracted correctly whatever separator glob returns.
_TESTNAME_RE = re.compile(r'([^/\\]*)\.test$')


def unescape(text):
    """Replace literal ``\\uXXXX`` escapes in *text* with their character."""
    return _ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


def format_tokens(tokens):
    """Serialize the expected token list of one test case.

    Each token starts with a line holding its type (``token[0]``).  A
    ``DOCTYPE`` token is followed by three lines for ``token[1]`` to
    ``token[3]``, with ``None`` written as ``<none>``.  Any other token is
    followed by one line holding ``token[1]``; for ``StartTag`` tokens the
    attributes from ``token[2]`` are appended as `` name=value`` pairs.

    Literal ``\\uXXXX`` escapes are expanded and NUL characters are replaced
    by U+FFFD in the returned string.
    """
    parts = []
    for token in tokens:
        kind = token[0]
        parts.append(kind + '\n')

        if kind == 'DOCTYPE':
            for i in range(1, 4):
                field = token[i]
                parts.append('<none>\n' if field is None else field + '\n')
        else:
            line = token[1]
            if kind == 'StartTag':
                line += ''.join(f' {name}={value}'
                                for name, value in token[2].items())
            parts.append(line + '\n')

    return unescape(''.join(parts)).replace('\x00', '\uFFFD')


def generate_file(filename, test_dir='test/html-tokenizer',
                  result_dir='result/html-tokenizer'):
    """Convert one html5lib tokenizer JSON file into test/result files.

    Writes ``{test_dir}/{name}.test`` (one header line
    ``counter lastStartTag state byte_length`` followed by the input) and
    ``{result_dir}/{name}.test`` (the counter followed by the serialized
    tokens), where *name* is the base name of *filename*.  Files that do
    not end in ``.test`` and the ``xmlViolation`` file are ignored.

    Raises:
        ValueError: if a test names an initial state missing from
            ``state_map``.
    """
    match = _TESTNAME_RE.search(filename)
    if match is None:
        return
    testname = match[1]
    if testname == 'xmlViolation':
        return

    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    # The header records the UTF-8 byte length of the input, so the files
    # must be written as UTF-8 without newline translation for that count
    # to match the bytes on disk on every platform.
    with open(f'{test_dir}/{testname}.test', 'w',
              encoding='utf-8', newline='\n') as test_out, \
         open(f'{result_dir}/{testname}.test', 'w',
              encoding='utf-8', newline='\n') as result_out:
        counter = 0

        for tests in root.values():
            for test in tests:
                source = test['input']

                # Skip surrogate tests: lone surrogates cannot be encoded
                # as UTF-8.
                if _SURROGATE_RE.search(source):
                    continue

                source = unescape(source)
                expected = format_tokens(test['output'])
                start_tag = test.get('lastStartTag', '-')
                byte_length = len(source.encode('utf-8'))

                # One numbered test case is emitted per initial state.
                for state in test.get('initialStates', ['Data state']):
                    state_no = state_map.get(state)
                    if state_no is None:
                        raise ValueError(
                            f'{filename}: unknown state: {state}')
                    # Tests starting in the CDATA section state are not
                    # generated.
                    if state_no == state_map['CDATA section state']:
                        continue

                    test_out.write(f'{counter} {start_tag} {state_no} '
                                   f'{byte_length}\n')
                    test_out.write(source)
                    test_out.write('\n')

                    result_out.write(f'{counter}\n')
                    result_out.write(expected)

                    counter += 1


def main(pattern='../html5lib-tests/tokenizer/*.test',
         test_dir='test/html-tokenizer',
         result_dir='result/html-tokenizer'):
    """Generate test/result files for every file matching *pattern*.

    Files are processed in sorted order; the output directories must
    already exist.
    """
    for filename in sorted(glob.glob(pattern)):
        generate_file(filename, test_dir, result_dir)


if __name__ == '__main__':
    main()