1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-01-26 10:03:34 +03:00

parser: Support custom charset conversion implementations

Implement xmlCtxtSetCharEncConvImpl. I agree that the name is terrible.
This commit is contained in:
Nick Wellnhofer 2024-06-28 00:34:52 +02:00
parent c59c24494d
commit 221df37529
5 changed files with 140 additions and 71 deletions

View File

@ -452,6 +452,9 @@ struct _xmlParserCtxt {
xmlResourceLoader resourceLoader XML_DEPRECATED_MEMBER;
void *resourceCtxt XML_DEPRECATED_MEMBER;
xmlCharEncConvImpl convImpl XML_DEPRECATED_MEMBER;
void *convCtxt XML_DEPRECATED_MEMBER;
};
/**
@ -1438,6 +1441,10 @@ XMLPUBFUN void
xmlCtxtSetResourceLoader(xmlParserCtxtPtr ctxt,
xmlResourceLoader loader,
void *vctxt);
XMLPUBFUN void
xmlCtxtSetCharEncConvImpl(xmlParserCtxtPtr ctxt,
xmlCharEncConvImpl impl,
void *vctxt);
XMLPUBFUN void
xmlCtxtSetMaxAmplification(xmlParserCtxtPtr ctxt,
unsigned maxAmpl);

View File

@ -376,7 +376,8 @@ XMLPUBFUN xmlParserInputPtr
xmlInputCreateIO(const char *url, xmlInputReadCallback ioRead,
xmlInputCloseCallback ioClose, void *ioCtxt, int flags);
XMLPUBFUN int
xmlInputSetEncoding(xmlParserInputPtr input, const char *encoding);
xmlInputSetEncodingHandler(xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler);
/**
* Namespaces.

View File

@ -1078,9 +1078,34 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
* *
************************************************************************/
/**
 * xmlCtxtSetCharEncConvImpl:
 * @ctxt:  parser context
 * @impl:  conversion callback
 * @vctxt:  user data stored alongside the callback
 *
 * Registers a custom implementation used to convert between
 * character encodings. The call is a no-op if @ctxt is NULL.
 *
 * This bypasses legacy features like global encoding handlers or
 * encoding aliases.
 *
 * Available since 2.14.0.
 */
void
xmlCtxtSetCharEncConvImpl(xmlParserCtxtPtr ctxt, xmlCharEncConvImpl impl,
                          void *vctxt) {
    if (ctxt != NULL) {
        ctxt->convCtxt = vctxt;
        ctxt->convImpl = impl;
    }
}
static int
xmlDetectEBCDIC(xmlParserInputPtr input, xmlCharEncodingHandlerPtr *hout) {
xmlDetectEBCDIC(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr *hout) {
xmlChar out[200];
xmlParserInputPtr input = ctxt->input;
xmlCharEncodingHandlerPtr handler;
int inlen, outlen, res, i;
@ -1088,9 +1113,10 @@ xmlDetectEBCDIC(xmlParserInputPtr input, xmlCharEncodingHandlerPtr *hout) {
/*
* To detect the EBCDIC code page, we convert the first 200 bytes
* to EBCDIC-US and try to find the encoding declaration.
* to IBM037 (EBCDIC-US) and try to find the encoding declaration.
*/
res = xmlLookupCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC, &handler);
res = xmlCreateCharEncodingHandler("IBM037", /* output */ 0,
ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != 0)
return(res);
outlen = sizeof(out) - 1;
@ -1133,8 +1159,9 @@ xmlDetectEBCDIC(xmlParserInputPtr input, xmlCharEncodingHandlerPtr *hout) {
break;
out[i] = 0;
xmlCharEncCloseFunc(handler);
res = xmlOpenCharEncodingHandler((char *) out + start,
/* output */ 0, &handler);
res = xmlCreateCharEncodingHandler((char *) out + start,
/* output */ 0, ctxt->convImpl, ctxt->convCtxt,
&handler);
if (res != 0)
return(res);
*hout = handler;
@ -1147,7 +1174,8 @@ done:
* Encoding handlers are stateful, so we have to recreate them.
*/
xmlCharEncCloseFunc(handler);
res = xmlLookupCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC, &handler);
res = xmlCreateCharEncodingHandler("IBM037", /* output */ 0,
ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != 0)
return(res);
*hout = handler;
@ -1184,7 +1212,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
res = 0;
break;
case XML_CHAR_ENCODING_EBCDIC:
res = xmlDetectEBCDIC(ctxt->input, &handler);
res = xmlDetectEBCDIC(ctxt, &handler);
break;
default:
res = xmlLookupCharEncodingHandler(enc, &handler);
@ -1224,7 +1252,8 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
if (encoding == NULL)
return(-1);
res = xmlOpenCharEncodingHandler(encoding, /* output */ 0, &handler);
res = xmlCreateCharEncodingHandler(encoding, /* output */ 0,
ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != 0) {
xmlFatalErr(ctxt, res, encoding);
return(-1);
@ -1267,7 +1296,7 @@ xmlSwitchEncodingName(xmlParserCtxtPtr ctxt, const char *encoding) {
*
* Returns an xmlParserErrors code.
*/
static int
int
xmlInputSetEncodingHandler(xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler) {
int nbchars;
@ -1341,33 +1370,6 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
return(XML_ERR_OK);
}
/**
 * xmlInputSetEncoding:
 * @input:  the input stream
 * @encoding:  the encoding name
 *
 * Decode the data of @input using the encoding called @encoding.
 * This overrides the encoding found in the XML declaration.
 *
 * A handler for @encoding is opened with xmlOpenCharEncodingHandler
 * and passed on to xmlInputSetEncodingHandler.
 *
 * Available since 2.14.0.
 *
 * Returns an xmlParserErrors code: XML_ERR_ARGUMENT if @encoding is
 * NULL, the error reported when opening the handler fails, and
 * otherwise the result of xmlInputSetEncodingHandler.
 */
int
xmlInputSetEncoding(xmlParserInputPtr input, const char *encoding) {
    xmlCharEncodingHandlerPtr handler;
    int code;

    if (encoding == NULL)
        return(XML_ERR_ARGUMENT);

    code = xmlOpenCharEncodingHandler(encoding, /* output */ 0, &handler);

    return((code != 0) ? code : xmlInputSetEncodingHandler(input, handler));
}
/**
* xmlSwitchInputEncoding:
* @ctxt: the parser context, only for error reporting
@ -2260,8 +2262,15 @@ xmlCheckHTTPInputInternal(xmlParserInputPtr input) {
if ((xmlStrstr(BAD_CAST mime, BAD_CAST "/xml")) ||
(xmlStrstr(BAD_CAST mime, BAD_CAST "+xml"))) {
encoding = xmlNanoHTTPEncoding(input->buf->context);
if (encoding != NULL)
xmlInputSetEncoding(input, encoding);
if (encoding != NULL) {
xmlCharEncodingHandlerPtr handler;
int res;
res = xmlOpenCharEncodingHandler(encoding, /* output */ 0,
&handler);
if (res == 0)
xmlInputSetEncodingHandler(input, handler);
}
}
redir = xmlNanoHTTPRedir(input->buf->context);

View File

@ -15876,38 +15876,6 @@ test_xmlInputCreateUrl(void) {
/*
 * API test for xmlInputSetEncoding: calls the function with every
 * combination of generated input streams and encoding names and
 * reports any change in the number of allocated memory blocks.
 *
 * Returns the number of combinations for which a leak was detected.
 */
static int
test_xmlInputSetEncoding(void) {
    int failures = 0;
    int n_input;
    int n_encoding;

    for (n_input = 0; n_input < gen_nb_xmlParserInputPtr; n_input++) {
        for (n_encoding = 0; n_encoding < gen_nb_const_char_ptr;
             n_encoding++) {
            /* Snapshot of the block count before anything is generated. */
            int mem_base = xmlMemBlocks();
            /* the input stream */
            xmlParserInputPtr input = gen_xmlParserInputPtr(n_input, 0);
            /* the encoding name */
            const char *encoding = gen_const_char_ptr(n_encoding, 1);
            int ret_val = xmlInputSetEncoding(input, encoding);

            desret_int(ret_val);
            call_tests++;
            des_xmlParserInputPtr(n_input, input, 0);
            des_const_char_ptr(n_encoding, encoding, 1);
            xmlResetLastError();

            if (mem_base == xmlMemBlocks())
                continue;

            /* Block count changed: report the offending combination. */
            printf("Leak of %d blocks found in xmlInputSetEncoding",
                   xmlMemBlocks() - mem_base);
            failures++;
            printf(" %d", n_input);
            printf(" %d", n_encoding);
            printf("\n");
        }
    }
    function_tests++;

    return(failures);
}

View File

@ -518,6 +518,89 @@ testBuildRelativeUri(void) {
return err;
}
/*
 * Set to 1 by rot13ConvImpl when it is asked for an encoding name
 * other than "rot13"; reset and checked by testCharEncConvImpl.
 */
static int charEncConvImplError;
/*
 * Conversion callback of the rot13 test converter.
 *
 * @vctxt points to an int holding the rotation amount. Copies bytes
 * from @in to @out, rotating ASCII letters within their own case by
 * that amount and leaving every other byte unchanged. At most
 * min(*inlen, *outlen) bytes are processed; on return both *inlen and
 * *outlen hold the number of bytes processed.
 *
 * Always returns XML_ENC_ERR_SUCCESS.
 */
static int
rot13Convert(void *vctxt, unsigned char *out, int *outlen,
             const unsigned char *in, int *inlen) {
    int limit = (*inlen < *outlen) ? *inlen : *outlen;
    int shift = *(int *) vctxt;
    int pos;

    for (pos = 0; pos < limit; pos++) {
        int ch = in[pos];

        if (ch >= 'a' && ch <= 'z')
            ch = 'a' + (ch - 'a' + shift) % 26;
        else if (ch >= 'A' && ch <= 'Z')
            ch = 'A' + (ch - 'A' + shift) % 26;

        out[pos] = ch;
    }

    /* Input consumed and output produced are always the same size. */
    *inlen = pos;
    *outlen = pos;

    return XML_ENC_ERR_SUCCESS;
}
/*
 * Context destructor of the rot13 test converter: releases the
 * conversion context @vctxt with xmlFree.
 */
static void
rot13ConvCtxtDtor(void *vctxt) {
    xmlFree(vctxt);
}
/*
 * Custom charset conversion implementation used by testCharEncConvImpl.
 *
 * @vctxt is unused. @name is the requested encoding name; only "rot13"
 * is supported. On success, @conv is filled with the rot13 conversion
 * callback, its context destructor and a freshly allocated input
 * context holding the rotation amount (13).
 *
 * Returns XML_ERR_OK on success, XML_ERR_UNSUPPORTED_ENCODING (and sets
 * charEncConvImplError) if @name is not "rot13", or XML_ERR_NO_MEMORY
 * if the input context cannot be allocated.
 */
static int
rot13ConvImpl(void *vctxt ATTRIBUTE_UNUSED, const char *name,
              xmlCharEncConverter *conv) {
    int *inputCtxt;

    if (strcmp(name, "rot13") != 0) {
        fprintf(stderr, "rot13ConvImpl received wrong name\n");
        charEncConvImplError = 1;

        return XML_ERR_UNSUPPORTED_ENCODING;
    }

    /*
     * Allocate before touching @conv so that it is left unmodified
     * when the allocation fails, and never dereference a NULL result.
     */
    inputCtxt = xmlMalloc(sizeof(*inputCtxt));
    if (inputCtxt == NULL)
        return XML_ERR_NO_MEMORY;
    *inputCtxt = 13;

    conv->convert = rot13Convert;
    conv->ctxtDtor = rot13ConvCtxtDtor;
    conv->inputCtxt = inputCtxt;

    return XML_ERR_OK;
}
/*
 * Checks that a conversion implementation installed with
 * xmlCtxtSetCharEncConvImpl is used by the parser.
 *
 * The document passed to the parser is the rot13 form of
 * "<?xml version='1.0'?><doc/>" and is read with the encoding name
 * "rot13", so the parsed root element is expected to be named "doc".
 *
 * Returns 0 on success, 1 on failure.
 */
static int
testCharEncConvImpl(void) {
    xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
    xmlDocPtr doc;
    xmlNodePtr root;
    int failed;

    xmlCtxtSetCharEncConvImpl(ctxt, rot13ConvImpl, NULL);

    charEncConvImplError = 0;
    doc = xmlCtxtReadDoc(ctxt, BAD_CAST "<?kzy irefvba='1.0'?><qbp/>", NULL,
                         "rot13", 0);
    /* Non-zero if rot13ConvImpl was asked for an unexpected encoding. */
    failed = (charEncConvImplError != 0);

    xmlFreeParserCtxt(ctxt);

    root = xmlDocGetRootElement(doc);
    if (!(root != NULL && strcmp((char *) root->name, "doc") == 0)) {
        fprintf(stderr, "testCharEncConvImpl failed\n");
        failed = 1;
    }

    xmlFreeDoc(doc);

    return failed;
}
int
main(void) {
int err = 0;
@ -546,6 +629,7 @@ main(void) {
err |= testWriterClose();
#endif
err |= testBuildRelativeUri();
err |= testCharEncConvImpl();
return err;
}