mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-24 06:50:08 +03:00
html: Adjust xmlDetectEncoding for HTML
Don't check for UTF-32 or EBCDIC. We now perform BOM sniffing and the first step of the HTML5 prescan algorithm (detect UTF-16 XML declarations). The rest of the algorithm still has to be implemented.
This commit is contained in:
parent
227d8f739b
commit
6bb2ea8e70
17
HTMLparser.c
17
HTMLparser.c
@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
||||
xmlDetectEncoding(ctxt);
|
||||
|
||||
/*
|
||||
* This is wrong but matches long-standing behavior. In most cases,
|
||||
* a document starting with an XML declaration will specify UTF-8.
|
||||
* TODO: Implement HTML5 prescan algorithm
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is wrong but matches long-standing behavior. In most
|
||||
* cases, a document starting with an XML declaration will
|
||||
* specify UTF-8. The HTML5 prescan algorithm handles
|
||||
* XML declarations in a better way.
|
||||
*/
|
||||
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
|
||||
@ -4943,10 +4949,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
|
||||
xmlDetectEncoding(ctxt);
|
||||
|
||||
/*
|
||||
* TODO: Implement HTML5 prescan algorithm
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is wrong but matches long-standing behavior. In most
|
||||
* cases, a document starting with an XML declaration will
|
||||
* specify UTF-8.
|
||||
* specify UTF-8. The HTML5 prescan algorithm handles
|
||||
* XML declarations in a better way.
|
||||
*/
|
||||
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
|
||||
|
@ -1454,12 +1454,23 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
|
||||
enc = XML_CHAR_ENCODING_NONE;
|
||||
bomSize = 0;
|
||||
|
||||
/*
|
||||
* BOM sniffing and detection of initial bytes of an XML
|
||||
* declaration.
|
||||
*
|
||||
* The HTML5 spec doesn't cover UTF-32 (UCS-4) or EBCDIC.
|
||||
*/
|
||||
switch (in[0]) {
|
||||
case 0x00:
|
||||
if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
|
||||
if ((!ctxt->html) &&
|
||||
(in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
|
||||
enc = XML_CHAR_ENCODING_UCS4BE;
|
||||
autoFlag = XML_INPUT_AUTO_OTHER;
|
||||
} else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
|
||||
/*
|
||||
* TODO: The HTML5 spec requires checking that the
|
||||
* next codepoint is an 'x'.
|
||||
*/
|
||||
enc = XML_CHAR_ENCODING_UTF16BE;
|
||||
autoFlag = XML_INPUT_AUTO_UTF16BE;
|
||||
}
|
||||
@ -1467,10 +1478,15 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
|
||||
|
||||
case 0x3C:
|
||||
if (in[1] == 0x00) {
|
||||
if ((in[2] == 0x00) && (in[3] == 0x00)) {
|
||||
if ((!ctxt->html) &&
|
||||
(in[2] == 0x00) && (in[3] == 0x00)) {
|
||||
enc = XML_CHAR_ENCODING_UCS4LE;
|
||||
autoFlag = XML_INPUT_AUTO_OTHER;
|
||||
} else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
|
||||
/*
|
||||
* TODO: The HTML5 spec requires checking that the
|
||||
* next codepoint is an 'x'.
|
||||
*/
|
||||
enc = XML_CHAR_ENCODING_UTF16LE;
|
||||
autoFlag = XML_INPUT_AUTO_UTF16LE;
|
||||
}
|
||||
@ -1478,7 +1494,8 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
|
||||
break;
|
||||
|
||||
case 0x4C:
|
||||
if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
|
||||
if ((!ctxt->html) &&
|
||||
(in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
|
||||
enc = XML_CHAR_ENCODING_EBCDIC;
|
||||
autoFlag = XML_INPUT_AUTO_OTHER;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user