2008-07-24 19:05:38 +04:00
/**
 * Test the UTF-8 decoding routines
 *
 * author: Daniel Veillard
 * copy: see Copyright for the status of this software.
 */
2023-03-13 21:38:41 +03:00
# define XML_DEPRECATED
2008-07-24 19:05:38 +04:00
# include <stdio.h>
# include <string.h>
2022-08-26 02:22:33 +03:00
# include <libxml/tree.h>
2008-07-24 19:05:38 +04:00
# include <libxml/parser.h>
# include <libxml/parserInternals.h>
/*
 * Code of the first error recorded by errorHandler() since this variable
 * was last cleared to 0 by one of the tests.
 */
int lastError ;
2023-10-24 16:02:36 +03:00
static void errorHandler ( void * unused , const xmlError * err ) {
2008-07-24 19:05:38 +04:00
if ( ( unused = = NULL ) & & ( err ! = NULL ) & & ( lastError = = 0 ) ) {
lastError = err - > code ;
}
}
/*
 * Document templates patched in place by the document range tests.
 * The four 'X' placeholders start at offset 5 of document1 (element
 * content) and at offset 10 of document2 (attribute value); these are
 * the offsets testDocumentRanges() injects bytes at, so the literals must
 * not carry any leading padding.
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
2022-04-03 18:54:23 +03:00
/*
 * testDocumentRangeByte1:
 * @ctxt: parser context, reset on every iteration (the parse itself goes
 *        through xmlReadMemory(), which is not handed @ctxt)
 * @document: buffer holding the document to parse
 * @len: number of bytes of @document to parse
 * @data: position inside @document where the byte is injected
 * @forbid1: byte value that must be rejected at this position, or -1
 * @forbid2: second byte value that must be rejected, or -1
 *
 * Writes every value 0x00..0xFF to data[0], parses the document and
 * compares the outcome (lastError and the returned tree) with what is
 * expected for that byte.
 *
 * Returns 0 if all 256 values behaved as expected, 1 on the first mismatch.
 */
static int
testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                       int len, char *data, int forbid1, int forbid2) {
    int i;

    for (i = 0; i <= 0xFF; i++) {
        xmlDocPtr res;
        int failed = 0;

        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = (char) i;

        res = xmlReadMemory(document, len, "test", NULL, 0);

        if ((i == forbid1) || (i == forbid2)) {
            /* bytes the caller declared illegal: error and no tree */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                    i, i);
                failed = 1;
            }
        } else if ((i == '<') || (i == '&')) {
            /* markup delimiters cannot appear literally here */
            if ((lastError == 0) || (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
                failed = 1;
            }
        } else if (((i < 0x20) || (i >= 0x80)) &&
                   (i != 0x9) && (i != 0xA) && (i != 0xD)) {
            /*
             * control characters other than TAB, LF, CR and lone bytes
             * >= 0x80: a tree is only tolerated together with
             * XML_ERR_INVALID_CHAR
             */
            if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) {
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", i);
                failed = 1;
            }
        } else if (res == NULL) {
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
            failed = 1;
        }

        /*
         * Release the tree before leaving, including on the failure
         * paths, which previously returned without freeing it.
         */
        if (res != NULL)
            xmlFreeDoc(res);
        if (failed)
            return (1);
    }
    return (0);
}
2022-04-03 18:54:23 +03:00
/*
 * testDocumentRangeByte2:
 * @ctxt: parser context, reset on every iteration (the parse itself goes
 *        through xmlReadMemory(), which is not handed @ctxt)
 * @document: buffer holding the document to parse
 * @len: number of bytes of @document to parse
 * @data: position inside @document where the two bytes are injected
 *
 * Writes every pair (i, j) with i in 0x80..0xFF and j in 0x00..0xFF to
 * data[0] and data[1], parses the document and checks that malformed
 * 2-byte sequences are rejected and well-formed ones are accepted.
 *
 * Returns 0 if every pair behaved as expected, 1 on the first mismatch.
 */
static int
testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                       int len, char *data) {
    int i, j;

    for (i = 0x80; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            xmlDocPtr res;
            int failed = 0;

            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = (char) i;
            data[1] = (char) j;

            res = xmlReadMemory(document, len, "test", NULL, 0);

            /* if first bit of first char is set, then second bit must too */
            if ((i & 0x80) && ((i & 0x40) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if using a 2 byte encoding then the value must be greater
             * than 0x80, i.e. one of bits 5 to 1 of i must be set
             */
            else if ((i & 0x80) && ((i & 0x1E) == 0)) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * if third bit of first char is set, then the sequence would
             * need at least 3 bytes, but we give only 2 !
             */
            else if ((i & 0xE0) == 0xE0) {
                if ((lastError == 0) || (res != NULL)) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            i, j);
                    failed = 1;
                }
            }

            /*
             * We should see no error in remaining cases
             */
            else if ((lastError != 0) || (res == NULL)) {
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
                failed = 1;
            }

            /*
             * Release the tree before leaving, including on the failure
             * paths, which previously returned without freeing it.
             */
            if (res != NULL)
                xmlFreeDoc(res);
            if (failed)
                return (1);
        }
    }
    return (0);
}
/**
* testDocumentRanges :
*
* Test the correct UTF8 character parsing in context of XML documents
* Those are in - context injection tests checking the parser behaviour on
* edge case values at different point in content , beginning and end of
* CDATA in text or in attribute values .
*/
2022-04-03 18:54:23 +03:00
static int testDocumentRanges ( void ) {
2008-07-24 19:05:38 +04:00
xmlParserCtxtPtr ctxt ;
char * data ;
2022-04-03 18:54:23 +03:00
int test_ret = 0 ;
2008-07-24 19:05:38 +04:00
/*
* Set up a parsing context using the first document as
* the current input source .
*/
ctxt = xmlNewParserCtxt ( ) ;
if ( ctxt = = NULL ) {
fprintf ( stderr , " Failed to allocate parser context \n " ) ;
2022-04-03 18:54:23 +03:00
return ( 1 ) ;
2008-07-24 19:05:38 +04:00
}
printf ( " testing 1 byte char in document: 1 " ) ;
fflush ( stdout ) ;
data = & document1 [ 5 ] ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 1 byte injection at beginning of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte1 ( ctxt , & document1 [ 0 ] , strlen ( document1 ) ,
2008-07-24 19:05:38 +04:00
data , - 1 , - 1 ) ;
printf ( " 2 " ) ;
fflush ( stdout ) ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 1 byte injection at end of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte1 ( ctxt , & document1 [ 0 ] , strlen ( document1 ) ,
2008-07-24 19:05:38 +04:00
data + 3 , - 1 , - 1 ) ;
printf ( " 3 " ) ;
fflush ( stdout ) ;
data = & document2 [ 10 ] ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 1 byte injection at beginning of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte1 ( ctxt , & document2 [ 0 ] , strlen ( document2 ) ,
2008-07-24 19:05:38 +04:00
data , ' \' ' , - 1 ) ;
printf ( " 4 " ) ;
fflush ( stdout ) ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 1 byte injection at end of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte1 ( ctxt , & document2 [ 0 ] , strlen ( document2 ) ,
2008-07-24 19:05:38 +04:00
data + 3 , ' \' ' , - 1 ) ;
printf ( " done \n " ) ;
printf ( " testing 2 byte char in document: 1 " ) ;
fflush ( stdout ) ;
data = & document1 [ 5 ] ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 2 byte injection at beginning of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte2 ( ctxt , & document1 [ 0 ] , strlen ( document1 ) ,
2008-07-24 19:05:38 +04:00
data ) ;
printf ( " 2 " ) ;
fflush ( stdout ) ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 2 byte injection at end of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte2 ( ctxt , & document1 [ 0 ] , strlen ( document1 ) ,
2008-07-24 19:05:38 +04:00
data + 2 ) ;
printf ( " 3 " ) ;
fflush ( stdout ) ;
data = & document2 [ 10 ] ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 2 byte injection at beginning of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte2 ( ctxt , & document2 [ 0 ] , strlen ( document2 ) ,
2008-07-24 19:05:38 +04:00
data ) ;
printf ( " 4 " ) ;
fflush ( stdout ) ;
data [ 0 ] = ' ' ;
data [ 1 ] = ' ' ;
data [ 2 ] = ' ' ;
data [ 3 ] = ' ' ;
/* test 2 byte injection at end of area */
2022-04-03 18:54:23 +03:00
test_ret + = testDocumentRangeByte2 ( ctxt , & document2 [ 0 ] , strlen ( document2 ) ,
2008-07-24 19:05:38 +04:00
data + 2 ) ;
printf ( " done \n " ) ;
xmlFreeParserCtxt ( ctxt ) ;
2022-04-03 18:54:23 +03:00
return ( test_ret ) ;
2008-07-24 19:05:38 +04:00
}
2023-10-22 17:32:08 +03:00
/*
 * testCurrentChar:
 * @ctxt: parser context whose current input is being decoded
 * @len: receives the length reported by xmlCurrentChar()
 *
 * Decodes the character at the current input position with
 * xmlCurrentChar(), then advances with xmlNextChar() and rewinds, and
 * cross-checks that both report the same error and, when there was no
 * error, the same length. The input flags are cleared after each call;
 * presumably so that a sticky error state does not hide the next report
 * (TODO confirm against the parser internals).
 *
 * Returns the value from xmlCurrentChar() with lastError set to the error
 * it raised, or -1 if the two functions disagree.
 */
static int
testCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    const xmlChar *start;
    int codepoint, curErr, nextErr, advanced;

    /* First pass: decode without moving. */
    lastError = 0;
    codepoint = xmlCurrentChar(ctxt, len);
    ctxt->input->flags = 0;
    curErr = lastError;

    /* Second pass: step over the character, then restore the position. */
    start = ctxt->input->cur;
    lastError = 0;
    xmlNextChar(ctxt);
    ctxt->input->flags = 0;
    nextErr = lastError;
    advanced = ctxt->input->cur - start;
    ctxt->input->cur = start;

    /* The error codes are not compared when the current byte is 0. */
    if ((curErr != nextErr) && (*ctxt->input->cur != 0)) {
        fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
                "errors: %d %d\n", curErr, nextErr);
        return (-1);
    }

    if ((curErr == 0) && (advanced != *len)) {
        fprintf(stderr, "xmlCurrentChar and xmlNextChar report different "
                "lengths: %d %d\n", *len, advanced);
        return (-1);
    }

    lastError = curErr;

    return (codepoint);
}
2022-11-15 00:27:58 +03:00
/*
 * testCharRangeByte1:
 * @ctxt: parser context whose current input points at the scratch buffer
 *
 * Puts every value 0x00..0xFF in the first byte of the input (followed by
 * zero bytes) and checks how it is decoded: bytes >= 0x80 must raise
 * XML_ERR_INVALID_ENCODING, 0xD must be returned as 0xA, and every other
 * byte must be returned unchanged with a length of 1.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int
testCharRangeByte1(xmlParserCtxtPtr ctxt) {
    char *data = (char *) ctxt->input->cur;
    int byte;
    int len, c;

    data[1] = 0;
    data[2] = 0;
    data[3] = 0;

    for (byte = 0; byte <= 0xFF; byte++) {
        data[0] = (char) byte;
        ctxt->nbErrors = 0;

        c = testCurrentChar(ctxt, &len);
        /*
         * NOTE(review): a negative result means testCurrentChar() already
         * printed a consistency failure; it is skipped here and does not
         * make this test fail.
         */
        if (c < 0)
            continue;

        if (byte >= 0x80) {
            /* we must see an error there */
            if (lastError == XML_ERR_INVALID_ENCODING)
                continue;
            fprintf(stderr,
                "Failed to detect invalid char for Byte 0x%02X\n", byte);
            return (1);
        }

        if (byte == 0xD) {
            if ((c == 0xA) && (len == 1))
                continue;
            fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", byte);
            return (1);
        }

        if ((c == byte) && (len == 1))
            continue;
        fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
        return (1);
    }
    return (0);
}
2022-11-15 00:27:58 +03:00
/*
 * testCharRangeByte2:
 * @ctxt: parser context whose current input points at the scratch buffer
 *
 * Puts every pair (lead, trail) with lead in 0x80..0xFF and trail in
 * 0x00..0xFF in the first two bytes of the input and checks that
 * malformed 2-byte sequences raise XML_ERR_INVALID_ENCODING and that
 * well-formed ones decode to the expected value with a length of 2.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int
testCharRangeByte2(xmlParserCtxtPtr ctxt) {
    char *data = (char *) ctxt->input->cur;
    int lead, trail;
    int len, c, expected;

    data[2] = 0;
    data[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            data[0] = (char) lead;
            data[1] = (char) trail;
            ctxt->nbErrors = 0;

            c = testCurrentChar(ctxt, &len);
            /*
             * NOTE(review): consistency failures reported by
             * testCurrentChar() are skipped, not counted.
             */
            if (c < 0)
                continue;

            /* if first bit of first char is set, then second bit must too */
            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
                    return (1);
                }
                continue;
            }

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            if ((lead & 0x80) && ((trail & 0xC0) != 0x80)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, c);
                    return (1);
                }
                continue;
            }

            /*
             * if using a 2 byte encoding then the value must be greater
             * than 0x80, i.e. one of bits 5 to 1 of the lead byte must
             * be set
             */
            if ((lead & 0x80) && ((lead & 0x1E) == 0)) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, c);
                    return (1);
                }
                continue;
            }

            /*
             * if third bit of first char is set, then the sequence would
             * need at least 3 bytes, but we give only 2 !
             */
            if ((lead & 0xE0) == 0xE0) {
                if (lastError != XML_ERR_INVALID_ENCODING) {
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
                    return (1);
                }
                continue;
            }

            /*
             * We should see no error in remaining cases
             */
            if ((lastError != 0) || (len != 2)) {
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                    lead, trail);
                return (1);
            }

            /*
             * Finally check the value is right
             */
            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);
            if (c != expected) {
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, c);
                return (1);
            }
        }
    }
    return (0);
}
2022-11-15 00:27:58 +03:00
/*
 * testCharRangeByte3:
 * @ctxt: parser context whose current input points at the scratch buffer
 *
 * Puts 3-byte sequences in the input: every lead byte 0xE0..0xFF, every
 * second byte 0x00..0xFF and a small set of representative third bytes.
 * Malformed, overlong and surrogate sequences must raise
 * XML_ERR_INVALID_ENCODING; the others must decode to the expected value
 * with a length of 3.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int
testCharRangeByte3(xmlParserCtxtPtr ctxt) {
    int i, j, k, K;
    int len, c;
    /* sample values for the last byte: valid and invalid continuations */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[3] = 0;
    for (i = 0xE0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->nbErrors = 0;

                c = testCurrentChar(ctxt, &len);
                /*
                 * NOTE(review): consistency failures reported by
                 * testCurrentChar() are skipped, not counted.
                 */
                if (c < 0)
                    continue;

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        /* data[3] is the 0 terminator written above */
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, 0);
                        return (1);
                    }
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return (1);
                    }
                }

                /*
                 * if using a 3 byte encoding then the value must be greater
                 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
                 * the 6th bit of data[1] must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                        return (1);
                    }
                }

                /*
                 * There are values that are not allowed in UTF-8
                 * (the range 0xD800..0xDFFF)
                 */
                else if ((value > 0xD7FF) && (value < 0xE000)) {
                    if (lastError != XML_ERR_INVALID_ENCODING) {
                        fprintf(stderr,
    "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                        return (1);
                    }
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                    return (1);
                }

                /*
                 * Finally check the value is right
                 */
                else if (c != value) {
                    /*
                     * Print K rather than data[2]: a plain char above 0x7F
                     * is sign-extended where char is signed and would show
                     * up as 0xFFFFFFxx.
                     */
                    fprintf(stderr,
    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                    return (1);
                }
            }
        }
    }
    return (0);
}
2022-11-15 00:27:58 +03:00
/*
 * testCharRangeByte4:
 * @ctxt: parser context whose current input points at the scratch buffer
 *
 * Puts 4-byte sequences in the input: every lead byte 0xF0..0xFF, every
 * second byte 0x00..0xFF and a small set of representative third and
 * fourth bytes. Malformed and overlong sequences, surrogates and values
 * above 0x10FFFF must raise XML_ERR_INVALID_ENCODING; the others must
 * decode to the expected value with a length of 4.
 *
 * Returns 0 on success, 1 on the first mismatch.
 */
static int
testCharRangeByte4(xmlParserCtxtPtr ctxt) {
    int i, j, k, K, l, L;
    int len, c;
    /* sample values for the last bytes: valid and invalid continuations */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    char *data = (char *) ctxt->input->cur;
    int value;

    data[4] = 0;
    for (i = 0xF0; i <= 0xFF; i++) {
        for (j = 0; j <= 0xFF; j++) {
            for (k = 0; k < 6; k++) {
                for (l = 0; l < 6; l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->nbErrors = 0;

                    c = testCurrentChar(ctxt, &len);
                    /*
                     * NOTE(review): consistency failures reported by
                     * testCurrentChar() are skipped, not counted.
                     */
                    if (c < 0)
                        continue;

                    /*
                     * if fifth bit of first char is set, then the sequence
                     * would need at least 5 bytes, but we give only 4 !
                     */
                    if ((i & 0xF8) == 0xF8) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            /*
                             * L holds the same byte as data[3] without the
                             * sign extension a plain char would get.
                             */
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return (1);
                        }
                    }

                    /*
                     * The second, third and fourth bytes must start with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return (1);
                        }
                    }

                    /*
                     * if using a 4 byte encoding then the value must be
                     * greater than 0x10000, i.e. one of bits 3 to 0 of i
                     * must be set or the 6th or 5th bit of j must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                            return (1);
                        }
                    }

                    /*
                     * There are values that are not allowed in UTF-8
                     * (0xD800..0xDFFF and anything above 0x10FFFF)
                     */
                    else if (((value > 0xD7FF) && (value < 0xE000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_ENCODING) {
                            fprintf(stderr,
"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                            return (1);
                        }
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        /* report all four bytes of the sequence */
                        fprintf(stderr,
            "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                        return (1);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        /* report all four bytes of the sequence */
                        fprintf(stderr,
"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                        return (1);
                    }
                }
            }
        }
    }
    return (0);
}
/**
* testCharRanges :
*
* Test the correct UTF8 character parsing in isolation i . e .
* not when parsing a full document , this is less expensive and we can
* cover the full range of UTF - 8 chars accepted by XML - 1.0
*/
2022-04-03 18:54:23 +03:00
static int testCharRanges ( void ) {
2008-07-24 19:05:38 +04:00
char data [ 5 ] ;
xmlParserCtxtPtr ctxt ;
xmlParserInputBufferPtr buf ;
xmlParserInputPtr input ;
2022-04-03 18:54:23 +03:00
int test_ret = 0 ;
2008-07-24 19:05:38 +04:00
memset ( data , 0 , 5 ) ;
/*
* Set up a parsing context using the above data buffer as
* the current input source .
*/
ctxt = xmlNewParserCtxt ( ) ;
if ( ctxt = = NULL ) {
fprintf ( stderr , " Failed to allocate parser context \n " ) ;
2022-04-03 18:54:23 +03:00
return ( 1 ) ;
2008-07-24 19:05:38 +04:00
}
2023-12-13 01:51:32 +03:00
buf = xmlParserInputBufferCreateStatic ( data , sizeof ( data ) ,
XML_CHAR_ENCODING_NONE ) ;
2008-07-24 19:05:38 +04:00
if ( buf = = NULL ) {
fprintf ( stderr , " Failed to allocate input buffer \n " ) ;
2022-04-03 18:54:23 +03:00
test_ret = 1 ;
2008-07-24 19:05:38 +04:00
goto error ;
}
input = xmlNewInputStream ( ctxt ) ;
if ( input = = NULL ) {
xmlFreeParserInputBuffer ( buf ) ;
2022-04-03 18:54:23 +03:00
test_ret = 1 ;
2008-07-24 19:05:38 +04:00
goto error ;
}
input - > filename = NULL ;
input - > buf = buf ;
2012-07-16 10:59:29 +04:00
input - > cur =
input - > base = xmlBufContent ( input - > buf - > buffer ) ;
input - > end = input - > base + 4 ;
2008-07-24 19:05:38 +04:00
inputPush ( ctxt , input ) ;
printf ( " testing char range: 1 " ) ;
fflush ( stdout ) ;
2022-11-15 00:27:58 +03:00
test_ret + = testCharRangeByte1 ( ctxt ) ;
2008-07-24 19:05:38 +04:00
printf ( " 2 " ) ;
fflush ( stdout ) ;
2022-11-15 00:27:58 +03:00
test_ret + = testCharRangeByte2 ( ctxt ) ;
2008-07-24 19:05:38 +04:00
printf ( " 3 " ) ;
fflush ( stdout ) ;
2022-11-15 00:27:58 +03:00
test_ret + = testCharRangeByte3 ( ctxt ) ;
2008-07-24 19:05:38 +04:00
printf ( " 4 " ) ;
fflush ( stdout ) ;
2022-11-15 00:27:58 +03:00
test_ret + = testCharRangeByte4 ( ctxt ) ;
2008-07-24 19:05:38 +04:00
printf ( " done \n " ) ;
fflush ( stdout ) ;
error :
xmlFreeParserCtxt ( ctxt ) ;
2022-04-03 18:54:23 +03:00
return ( test_ret ) ;
2008-07-24 19:05:38 +04:00
}
2023-04-20 14:52:16 +03:00
static int
testUserEncoding ( void ) {
/*
* Create a document encoded as UTF - 16L E with an ISO - 8859 - 1 encoding
* declaration , then parse it with xmlReadMemory and the encoding
* argument set to UTF - 16L E .
*/
xmlDocPtr doc = NULL ;
const char * start = " <?xml version='1.0' encoding='ISO-8859-1'?><d> " ;
const char * end = " </d> " ;
char * buf = NULL ;
xmlChar * text ;
int startSize = strlen ( start ) ;
int textSize = 100000 ; /* Make sure to exceed internal buffer sizes. */
int endSize = strlen ( end ) ;
int totalSize = startSize + textSize + endSize ;
int k = 0 ;
int i ;
2023-04-21 03:56:10 +03:00
int ret = 1 ;
2023-04-20 14:52:16 +03:00
buf = xmlMalloc ( 2 * totalSize ) ;
for ( i = 0 ; start [ i ] ! = 0 ; i + + ) {
buf [ k + + ] = start [ i ] ;
buf [ k + + ] = 0 ;
}
for ( i = 0 ; i < textSize ; i + + ) {
buf [ k + + ] = ' x ' ;
buf [ k + + ] = 0 ;
}
for ( i = 0 ; end [ i ] ! = 0 ; i + + ) {
buf [ k + + ] = end [ i ] ;
buf [ k + + ] = 0 ;
}
doc = xmlReadMemory ( buf , 2 * totalSize , NULL , " UTF-16LE " , 0 ) ;
if ( doc = = NULL ) {
fprintf ( stderr , " failed to parse document \n " ) ;
goto error ;
}
2023-04-21 03:56:10 +03:00
text = doc - > children - > children - > content ;
2023-04-20 14:52:16 +03:00
for ( i = 0 ; i < textSize ; i + + ) {
if ( text [ i ] ! = ' x ' ) {
fprintf ( stderr , " text node has wrong content at offset %d \n " , k ) ;
goto error ;
}
}
2023-04-21 03:56:10 +03:00
ret = 0 ;
2023-04-20 14:52:16 +03:00
error :
xmlFreeDoc ( doc ) ;
xmlFree ( buf ) ;
return ret ;
}
2023-08-08 16:19:49 +03:00
# if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
/*
 * convert:
 * @handler: encoding handler whose output function does the conversion
 * @utf8: the UTF-8 input
 * @size: number of bytes of @utf8 to convert
 * @outSize: receives the number of bytes produced, 0 on failure
 *
 * Converts @utf8 with @handler into a newly allocated buffer of
 * 2 * @size bytes (assumes the target encoding needs at most two output
 * bytes per input byte -- TODO confirm for handlers other than UTF-16).
 *
 * Returns the converted buffer, to be released with xmlFree(), or NULL
 * if an argument is unusable, the allocation fails, the handler reports
 * an error or not all of the input was consumed.
 */
static char *
convert(xmlCharEncodingHandlerPtr handler, const char *utf8, int size,
        int *outSize) {
    char *ret;
    int inlen;
    int res;

    if (outSize == NULL)
        return (NULL);
    if ((handler == NULL) || (handler->output == NULL) ||
        (utf8 == NULL) || (size < 0)) {
        *outSize = 0;
        return (NULL);
    }

    inlen = size;
    *outSize = size * 2;
    ret = xmlMalloc(*outSize);
    if (ret == NULL) {
        *outSize = 0;
        return (NULL);
    }

    res = handler->output(BAD_CAST ret, outSize, BAD_CAST utf8, &inlen);
    if ((res < 0) || (inlen != size)) {
        xmlFree(ret);
        /* never leave a stale size behind for a NULL buffer */
        *outSize = 0;
        return (NULL);
    }

    return (ret);
}
static int
testUserEncodingPush ( void ) {
xmlCharEncodingHandlerPtr handler ;
xmlParserCtxtPtr ctxt ;
xmlDocPtr doc ;
char buf [ ] =
" \xEF \xBB \xBF "
" <?xml version='1.0' encoding='ISO-8859-1'?> \n "
" <d>text</d> \n " ;
char * utf16 ;
int utf16Size ;
int ret = 1 ;
handler = xmlGetCharEncodingHandler ( XML_CHAR_ENCODING_UTF16LE ) ;
utf16 = convert ( handler , buf , sizeof ( buf ) - 1 , & utf16Size ) ;
ctxt = xmlCreatePushParserCtxt ( NULL , NULL , NULL , 0 , NULL ) ;
xmlSwitchEncoding ( ctxt , XML_CHAR_ENCODING_UTF16LE ) ;
xmlParseChunk ( ctxt , utf16 , utf16Size , 0 ) ;
xmlParseChunk ( ctxt , NULL , 0 , 1 ) ;
doc = ctxt - > myDoc ;
if ( ( doc ! = NULL ) & &
( doc - > children ! = NULL ) & &
( doc - > children - > children ! = NULL ) & &
( xmlStrcmp ( doc - > children - > children - > content , BAD_CAST " text " ) = = 0 ) )
ret = 0 ;
xmlFreeDoc ( doc ) ;
xmlFreeParserCtxt ( ctxt ) ;
xmlFree ( utf16 ) ;
return ( ret ) ;
}
2023-05-18 20:23:58 +03:00
static int
testUTF8Chunks ( void ) {
xmlParserCtxtPtr ctxt ;
xmlChar * out ;
int outSize ;
char * buf ;
int i ;
int ret = 0 ;
ctxt = xmlCreatePushParserCtxt ( NULL , NULL , NULL , 0 , NULL ) ;
xmlParseChunk ( ctxt , " <d> " , 3 , 0 ) ;
xmlParseChunk ( ctxt , " \xF0 " , 1 , 0 ) ;
xmlParseChunk ( ctxt , " \x9F " , 1 , 0 ) ;
xmlParseChunk ( ctxt , " \x98 " , 1 , 0 ) ;
xmlParseChunk ( ctxt , " \x8A " , 1 , 0 ) ;
xmlParseChunk ( ctxt , " </d> " , 4 , 1 ) ;
xmlDocDumpMemory ( ctxt - > myDoc , & out , & outSize ) ;
if ( strcmp ( ( char * ) out ,
" <?xml version= \" 1.0 \" ?> \n <d>😊</d> \n " ) ! = 0 ) {
fprintf ( stderr , " failed UTF-8 chunk test 1 \n " ) ;
ret + = 1 ;
}
xmlFree ( out ) ;
xmlFreeDoc ( ctxt - > myDoc ) ;
xmlFreeParserCtxt ( ctxt ) ;
ctxt = xmlCreatePushParserCtxt ( NULL , NULL , NULL , 0 , NULL ) ;
xmlParseChunk ( ctxt , " <d> " , 3 , 0 ) ;
/*
* Create a chunk longer than XML_PARSER_BIG_BUFFER_SIZE ( 300 ) ending
* with an incomplete UTF - 8 sequence .
*/
buf = xmlMalloc ( 1000 * 2 + 1 ) ;
for ( i = 0 ; i < 2000 ; i + = 2 )
memcpy ( buf + i , " \xCE \xB1 " , 2 ) ;
buf [ i ] = ' \xCE ' ;
xmlParseChunk ( ctxt , buf , 2001 , 0 ) ;
xmlFree ( buf ) ;
xmlParseChunk ( ctxt , " \xB1 </d> " , 4 , 0 ) ;
xmlParseChunk ( ctxt , NULL , 0 , 0 ) ;
xmlDocDumpMemory ( ctxt - > myDoc , & out , & outSize ) ;
if ( strncmp ( ( char * ) out , " <?xml version= \" 1.0 \" ?> \n <d> " , 25 ) ! = 0 ) {
fprintf ( stderr , " failed UTF-8 chunk test 2-1 \n " ) ;
ret + = 1 ;
goto error ;
}
for ( i = 25 ; i < 25 + 1001 * 7 ; i + = 7 ) {
if ( memcmp ( out + i , " α " , 7 ) ! = 0 ) {
fprintf ( stderr , " failed UTF-8 chunk test 2-2 %d \n " , i ) ;
ret + = 1 ;
goto error ;
}
}
if ( strcmp ( ( char * ) out + i , " </d> \n " ) ! = 0 ) {
fprintf ( stderr , " failed UTF-8 chunk test 2-3 \n " ) ;
ret + = 1 ;
goto error ;
}
error :
xmlFree ( out ) ;
xmlFreeDoc ( ctxt - > myDoc ) ;
xmlFreeParserCtxt ( ctxt ) ;
return ( ret ) ;
return ( 0 ) ;
}
2023-08-08 16:19:49 +03:00
# endif
2008-07-24 19:05:38 +04:00
int main ( void ) {
2022-04-03 18:54:23 +03:00
int ret = 0 ;
2008-07-24 19:05:38 +04:00
/*
* this initialize the library and check potential ABI mismatches
* between the version it was compiled for and the actual shared
* library used .
*/
LIBXML_TEST_VERSION
/*
* Catch errors separately
*/
xmlSetStructuredErrorFunc ( NULL , errorHandler ) ;
/*
* Run the tests
*/
2022-04-03 18:54:23 +03:00
ret + = testCharRanges ( ) ;
ret + = testDocumentRanges ( ) ;
2023-04-20 14:52:16 +03:00
ret + = testUserEncoding ( ) ;
2023-08-08 16:19:49 +03:00
# if defined(LIBXML_PUSH_ENABLED) && defined(LIBXML_OUTPUT_ENABLED)
ret + = testUserEncodingPush ( ) ;
2023-05-18 20:23:58 +03:00
ret + = testUTF8Chunks ( ) ;
2023-08-08 16:19:49 +03:00
# endif
2008-07-24 19:05:38 +04:00
/*
* Cleanup function for the XML library .
*/
xmlCleanupParser ( ) ;
2022-04-03 18:54:23 +03:00
return ( ret ? 1 : 0 ) ;
2008-07-24 19:05:38 +04:00
}