1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00
libxml2/tools/genHtml5LibTests.py
2024-10-06 18:13:05 +02:00

87 lines
2.6 KiB
Python

#!/usr/bin/env python3
import glob
import json
import re
# Numeric code written into each generated test header for the tokenizer
# state named in an html5lib test's "initialStates" list.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    'CDATA section state': 5,
}

# Literal backslash-u escape sequences (a backslash, 'u', four hex digits)
# that appear as plain text inside the test data.
_ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')
# Same notation, restricted to the surrogate range U+D800..U+DFFF.
_SURROGATE_RE = re.compile(r'\\uD[89A-F]', re.I)
# Captures the base name of a ".test" file path.
_TEST_NAME_RE = re.compile(r'/([^/]*)\.test$')


def _unescape(text):
    """Replace every literal ``\\uXXXX`` sequence in *text* with its character."""
    return _ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


def _format_tokens(tokens):
    """Render a list of html5lib output tokens as the expected-result text.

    Each token contributes its type on one line. A DOCTYPE token is followed
    by three lines holding token[1..3], with ``<none>`` standing in for
    ``None``. Any other token is followed by one line holding token[1]; for a
    StartTag the attributes from token[2] are appended as `` name=value``.
    Escape sequences are decoded afterwards and NUL characters are replaced
    with U+FFFD.
    """
    parts = []
    for token in tokens:
        kind = token[0]
        parts.append(kind + '\n')
        if kind == 'DOCTYPE':
            for field in (token[1], token[2], token[3]):
                parts.append('<none>\n' if field is None else field + '\n')
        else:
            parts.append(token[1])
            if kind == 'StartTag':
                parts.extend(f' {name}={value}'
                             for name, value in token[2].items())
            parts.append('\n')

    # Decode escapes first so that an escaped NUL is also replaced below.
    return _unescape(''.join(parts)).replace('\x00', '\uFFFD')


def generate_tests(filename, test_dir='test/html-tokenizer',
                   result_dir='result/html-tokenizer'):
    """Convert one html5lib tokenizer test file into a test/result file pair.

    Args:
        filename: Path of the JSON ``.test`` file to read. Paths that do not
            end in ``/<name>.test`` and the ``xmlViolation`` file are
            skipped without writing anything.
        test_dir: Directory receiving ``<name>.test`` with the inputs.
        result_dir: Directory receiving ``<name>.test`` with the expected
            tokenizer output.

    Raises:
        Exception: If a test names an initial state missing from
            ``state_map``.
    """
    match = _TEST_NAME_RE.search(filename)
    if match is None:
        return
    testname = match[1]
    if testname == 'xmlViolation':
        return

    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    test_path = f'{test_dir}/{testname}.test'
    result_path = f'{result_dir}/{testname}.test'

    # The header stores the UTF-8 byte length of the input, so the files are
    # written as UTF-8 without newline translation to keep the bytes on disk
    # in agreement with that length on every platform. The context managers
    # close both files even when an unknown state aborts the conversion.
    with open(test_path, 'w', encoding='utf-8', newline='') as test_out, \
            open(result_path, 'w', encoding='utf-8', newline='') as result_out:
        counter = 0

        for tests in root.values():
            for test in tests:
                source = test['input']

                # Skip surrogate tests
                if _SURROGATE_RE.search(source):
                    continue

                source = _unescape(source)
                expected = _format_tokens(test['output'])
                start_tag = test.get('lastStartTag', '-')

                # One numbered entry is emitted per initial state.
                for state in test.get('initialStates', ['Data state']):
                    state_no = state_map.get(state)
                    if state_no is None:
                        raise Exception(
                            f'{filename}: unknown state: {state}')
                    # Tests starting in the CDATA section state are not
                    # emitted.
                    if state_no == 5:
                        continue

                    test_out.write(f'{counter} {start_tag} {state_no} '
                                   f'{len(source.encode("utf-8"))}\n')
                    test_out.write(source)
                    test_out.write('\n')

                    result_out.write(f'{counter}\n')
                    result_out.write(expected)

                    counter += 1


for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    generate_tests(filename)