2020-08-24 17:28:54 +03:00
/*
* xmlSeed . c : Generate the XML seed corpus for fuzzing .
*
* See Copyright for the status of this software .
*/
# include <stdio.h>
# include <string.h>
# include <glob.h>
# include <libgen.h>
# include <sys/stat.h>
# ifdef _WIN32
# include <direct.h>
# else
# include <unistd.h>
# endif
# include <libxml/parser.h>
# include <libxml/parserInternals.h>
# include <libxml/HTMLparser.h>
# include <libxml/xinclude.h>
# include <libxml/xmlschemas.h>
# include "fuzz.h"
/* Capacity of the fixed-size buffers that hold file system paths. */
#define PATH_SIZE 500
/* Size of the buffer used when copying an input file into a seed. */
#define SEED_BUF_SIZE 16384
/* Capacity of the line buffer that holds one XPath expression. */
#define EXPR_SIZE 4500
2024-04-16 14:24:12 +03:00
/* globalData.flags bit: processXml appends an initial reader program. */
#define FLAG_READER (1 << 0)
2024-05-13 13:18:08 +03:00
/* globalData.flags bit: processXml writes the lint-style seed header. */
#define FLAG_LINT (1 << 1)
2024-04-16 14:24:12 +03:00
2020-08-24 17:28:54 +03:00
/* Writes the seed for the input file `base` to the open stream `out`. */
typedef int (*fileFunc)(const char *base, FILE *out);

/* Handles one command-line argument that follows the fuzzer name. */
typedef int (*mainFunc)(const char *arg);
/* State shared by the seed generators in this file. */
static struct {
    /* Seed file that recorded resources are appended to. */
    FILE *out;
    /* URLs already recorded; the stored value is only a non-NULL marker. */
    xmlHashTablePtr entities;
    /* External entity loader that was active when recording was set up. */
    xmlExternalEntityLoader oldLoader;
    /* Per-file seed writer selected in main() for pattern-based fuzzers. */
    fileFunc processFile;
    /* Fuzzer name from the command line; names the directory below seed/. */
    const char *fuzzer;
    /* Running number used in the names of XPath seed files. */
    int counter;
    /* Working directory at startup, restored after each chdir(). */
    char cwd[PATH_SIZE];
    /* Combination of FLAG_* bits. */
    int flags;
} globalData;
2022-09-02 18:47:48 +03:00
# if defined(HAVE_SCHEMA_FUZZER) || \
defined ( HAVE_XML_FUZZER )
2020-08-24 17:28:54 +03:00
/*
2024-06-11 16:48:32 +03:00
* A custom resource loader that writes all external DTDs or entities to a
* single file in the format expected by xmlFuzzResourceLoader .
2020-08-24 17:28:54 +03:00
*/
2024-06-11 16:48:32 +03:00
static int
fuzzResourceRecorder ( void * data ATTRIBUTE_UNUSED , const char * URL ,
const char * ID ATTRIBUTE_UNUSED ,
2024-06-11 20:10:41 +03:00
xmlResourceType type ATTRIBUTE_UNUSED , int flags ,
2024-06-11 16:48:32 +03:00
xmlParserInputPtr * out ) {
2020-08-24 17:28:54 +03:00
xmlParserInputPtr in ;
static const int chunkSize = 16384 ;
2024-06-11 16:48:32 +03:00
int code , len ;
* out = NULL ;
2020-08-24 17:28:54 +03:00
2024-06-11 16:48:32 +03:00
code = xmlInputCreateUrl ( URL , flags , & in ) ;
if ( code ! = XML_ERR_OK )
return ( code ) ;
2020-08-24 17:28:54 +03:00
if ( globalData . entities = = NULL ) {
globalData . entities = xmlHashCreate ( 4 ) ;
} else if ( xmlHashLookup ( globalData . entities ,
( const xmlChar * ) URL ) ! = NULL ) {
2024-06-11 16:48:32 +03:00
* out = in ;
return ( XML_ERR_OK ) ;
2020-08-24 17:28:54 +03:00
}
do {
len = xmlParserInputBufferGrow ( in - > buf , chunkSize ) ;
if ( len < 0 ) {
fprintf ( stderr , " Error reading %s \n " , URL ) ;
xmlFreeInputStream ( in ) ;
2024-06-11 16:48:32 +03:00
return ( in - > buf - > error ) ;
2020-08-24 17:28:54 +03:00
}
} while ( len > 0 ) ;
xmlFuzzWriteString ( globalData . out , URL ) ;
xmlFuzzWriteString ( globalData . out ,
( char * ) xmlBufContent ( in - > buf - > buffer ) ) ;
xmlFreeInputStream ( in ) ;
2023-02-28 23:16:12 +03:00
xmlHashAddEntry ( globalData . entities , ( const xmlChar * ) URL ,
globalData . entities ) ;
2020-08-24 17:28:54 +03:00
2024-06-11 16:48:32 +03:00
return ( xmlInputCreateUrl ( URL , flags , out ) ) ;
2020-08-24 17:28:54 +03:00
}
/*
 * Set up globalData so that fuzzResourceRecorder appends to `out`.
 */
static void
fuzzRecorderInit(FILE *out) {
    /* Remember the loader that is active before recording starts. */
    globalData.oldLoader = xmlGetExternalEntityLoader();
    /* Begin with an empty set of recorded URLs. */
    globalData.entities = xmlHashCreate(8);
    globalData.out = out;
}
static void
2022-09-02 18:47:48 +03:00
fuzzRecorderCleanup ( void ) {
2023-02-28 23:16:12 +03:00
xmlHashFree ( globalData . entities , NULL ) ;
2020-08-24 17:28:54 +03:00
globalData . out = NULL ;
globalData . entities = NULL ;
globalData . oldLoader = NULL ;
}
2022-09-02 18:47:48 +03:00
# endif
2020-08-24 17:28:54 +03:00
2021-02-22 23:26:13 +03:00
# ifdef HAVE_XML_FUZZER
2020-08-24 17:28:54 +03:00
static int
processXml ( const char * docFile , FILE * out ) {
int opts = XML_PARSE_NOENT | XML_PARSE_DTDLOAD ;
2024-06-11 16:48:32 +03:00
xmlParserCtxtPtr ctxt ;
2020-08-24 17:28:54 +03:00
xmlDocPtr doc ;
2024-05-13 13:18:08 +03:00
if ( globalData . flags & FLAG_LINT ) {
/* Switches */
xmlFuzzWriteInt ( out , 0 , 4 ) ;
xmlFuzzWriteInt ( out , 0 , 4 ) ;
/* maxmem */
xmlFuzzWriteInt ( out , 0 , 4 ) ;
/* max-ampl */
xmlFuzzWriteInt ( out , 0 , 1 ) ;
/* pretty */
xmlFuzzWriteInt ( out , 0 , 1 ) ;
/* encode */
xmlFuzzWriteString ( out , " " ) ;
/* pattern */
xmlFuzzWriteString ( out , " " ) ;
/* xpath */
xmlFuzzWriteString ( out , " " ) ;
} else {
/* Parser options. */
xmlFuzzWriteInt ( out , opts , 4 ) ;
/* Max allocations. */
xmlFuzzWriteInt ( out , 0 , 4 ) ;
if ( globalData . flags & FLAG_READER ) {
/* Initial reader program with a couple of OP_READs */
xmlFuzzWriteString ( out , " \x01 \x01 \x01 \x01 \x01 \x01 \x01 \x01 " ) ;
}
2024-04-16 14:24:12 +03:00
}
2020-08-24 17:28:54 +03:00
fuzzRecorderInit ( out ) ;
2024-06-11 16:48:32 +03:00
ctxt = xmlNewParserCtxt ( ) ;
2024-06-11 17:58:09 +03:00
xmlCtxtSetErrorHandler ( ctxt , xmlFuzzSErrorFunc , NULL ) ;
2024-06-11 16:48:32 +03:00
xmlCtxtSetResourceLoader ( ctxt , fuzzResourceRecorder , NULL ) ;
doc = xmlCtxtReadFile ( ctxt , docFile , NULL , opts ) ;
2023-09-21 14:05:49 +03:00
# ifdef LIBXML_XINCLUDE_ENABLED
2024-06-11 16:48:32 +03:00
{
xmlXIncludeCtxtPtr xinc = xmlXIncludeNewContext ( doc ) ;
2024-06-11 17:58:09 +03:00
xmlXIncludeSetErrorHandler ( xinc , xmlFuzzSErrorFunc , NULL ) ;
2024-06-11 16:48:32 +03:00
xmlXIncludeSetResourceLoader ( xinc , fuzzResourceRecorder , NULL ) ;
xmlXIncludeSetFlags ( xinc , opts ) ;
xmlXIncludeProcessNode ( xinc , ( xmlNodePtr ) doc ) ;
xmlXIncludeFreeContext ( xinc ) ;
}
2023-09-21 14:05:49 +03:00
# endif
2020-08-24 17:28:54 +03:00
xmlFreeDoc ( doc ) ;
2024-06-11 16:48:32 +03:00
xmlFreeParserCtxt ( ctxt ) ;
2020-08-24 17:28:54 +03:00
fuzzRecorderCleanup ( ) ;
return ( 0 ) ;
}
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
2021-02-22 23:26:13 +03:00
# ifdef HAVE_HTML_FUZZER
2020-08-24 17:28:54 +03:00
/*
 * Write the seed for one HTML document: two zeroed 4-byte header fields
 * followed by a verbatim copy of docFile.
 *
 * docFile:  document to copy
 * out:      seed file to write
 *
 * Always returns 0; a file that cannot be opened is only reported on
 * stderr.
 */
static int
processHtml(const char *docFile, FILE *out) {
    char chunk[SEED_BUF_SIZE];
    FILE *src;
    size_t nread;

    /* Parser options. */
    xmlFuzzWriteInt(out, 0, 4);
    /* Max allocations. */
    xmlFuzzWriteInt(out, 0, 4);

    /* Copy file */
    src = fopen(docFile, "rb");
    if (src == NULL) {
        fprintf(stderr, "couldn't open %s\n", docFile);
        return(0);
    }

    while ((nread = fread(chunk, 1, SEED_BUF_SIZE, src)) > 0) {
        fwrite(chunk, 1, nread, out);
        /* A short read ends the copy. */
        if (nread != SEED_BUF_SIZE)
            break;
    }

    fclose(src);

    return(0);
}
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
2021-02-22 23:26:13 +03:00
# ifdef HAVE_SCHEMA_FUZZER
2020-08-24 17:28:54 +03:00
static int
processSchema ( const char * docFile , FILE * out ) {
xmlSchemaPtr schema ;
xmlSchemaParserCtxtPtr pctxt ;
2023-03-08 15:59:03 +03:00
/* Max allocations. */
xmlFuzzWriteInt ( out , 0 , 4 ) ;
2020-08-24 17:28:54 +03:00
fuzzRecorderInit ( out ) ;
pctxt = xmlSchemaNewParserCtxt ( docFile ) ;
2024-06-11 17:58:09 +03:00
xmlSchemaSetParserStructuredErrors ( pctxt , xmlFuzzSErrorFunc , NULL ) ;
2024-06-11 16:48:32 +03:00
xmlSchemaSetResourceLoader ( pctxt , fuzzResourceRecorder , NULL ) ;
2020-08-24 17:28:54 +03:00
schema = xmlSchemaParse ( pctxt ) ;
xmlSchemaFreeParserCtxt ( pctxt ) ;
xmlSchemaFree ( schema ) ;
fuzzRecorderCleanup ( ) ;
return ( 0 ) ;
}
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
2022-09-02 18:47:48 +03:00
# if defined(HAVE_HTML_FUZZER) || \
defined ( HAVE_SCHEMA_FUZZER ) | | \
defined ( HAVE_XML_FUZZER )
2020-08-24 17:28:54 +03:00
/*
 * Generate one seed file for every regular file matching a glob pattern.
 *
 * For each match the seed is created as seed/<fuzzer>/<basename>, the
 * process changes into the directory of the matched file, and
 * globalData.processFile is called with the base name. Afterwards the
 * working directory saved in globalData.cwd is restored.
 *
 * pattern:  glob pattern selecting the input files
 *
 * Returns 0 on success (including when nothing matches) and -1 if the
 * pattern could not be expanded or any file failed. Processing stops
 * early when the original working directory cannot be restored.
 */
static int
processPattern(const char *pattern) {
    glob_t matches;
    size_t idx;
    int status = 0;
    int globRes;

    globRes = glob(pattern, 0, NULL, &matches);
    if (globRes == GLOB_NOMATCH)
        return(0);
    if (globRes != 0) {
        fprintf(stderr, "couldn't match pattern %s\n", pattern);
        return(-1);
    }

    for (idx = 0; idx < matches.gl_pathc; idx++) {
        struct stat info;
        char outPath[PATH_SIZE];
        char *dirCopy = NULL;
        char *baseCopy = NULL;
        const char *path, *dir, *base;
        FILE *out = NULL;
        int moved = 0;
        size_t len;

        path = matches.gl_pathv[idx];

        /* Only regular files are turned into seeds. */
        if ((stat(path, &info) != 0) || (!S_ISREG(info.st_mode)))
            continue;

        /* dirname() and basename() each get a private copy of the path. */
        dirCopy = (char *) xmlCharStrdup(path);
        baseCopy = (char *) xmlCharStrdup(path);
        if ((dirCopy == NULL) || (baseCopy == NULL)) {
            fprintf(stderr, "memory allocation failed\n");
            status = -1;
            goto cleanup;
        }
        dir = dirname(dirCopy);
        base = basename(baseCopy);

        len = snprintf(outPath, sizeof(outPath), "seed/%s/%s",
                       globalData.fuzzer, base);
        if (len >= PATH_SIZE) {
            fprintf(stderr, "creating path failed\n");
            status = -1;
            goto cleanup;
        }

        /* Open the seed before leaving the current directory. */
        out = fopen(outPath, "wb");
        if (out == NULL) {
            fprintf(stderr, "couldn't open %s for writing\n", outPath);
            status = -1;
            goto cleanup;
        }

        if (chdir(dir) != 0) {
            fprintf(stderr, "couldn't chdir to %s\n", dir);
            status = -1;
            goto cleanup;
        }
        moved = 1;

        if (globalData.processFile(base, out) != 0)
            status = -1;

cleanup:
        if (out != NULL)
            fclose(out);
        xmlFree(dirCopy);
        xmlFree(baseCopy);

        /* Without the original directory later paths cannot be trusted. */
        if ((moved) && (chdir(globalData.cwd) != 0)) {
            fprintf(stderr, "couldn't chdir to %s\n", globalData.cwd);
            status = -1;
            break;
        }
    }

    globfree(&matches);

    return(status);
}
2022-09-02 18:47:48 +03:00
# endif
2020-08-24 17:28:54 +03:00
2021-02-22 23:26:13 +03:00
# ifdef HAVE_XPATH_FUZZER
2020-08-24 17:28:54 +03:00
static int
processXPath ( const char * testDir , const char * prefix , const char * name ,
const char * data , const char * subdir , int xptr ) {
char pattern [ PATH_SIZE ] ;
glob_t globbuf ;
size_t i , size ;
int ret = 0 , res ;
size = snprintf ( pattern , sizeof ( pattern ) , " %s/%s/%s* " ,
testDir , subdir , prefix ) ;
if ( size > = PATH_SIZE )
return ( - 1 ) ;
res = glob ( pattern , 0 , NULL , & globbuf ) ;
if ( res = = GLOB_NOMATCH )
return ( 0 ) ;
if ( res ! = 0 ) {
fprintf ( stderr , " couldn't match pattern %s \n " , pattern ) ;
return ( - 1 ) ;
}
for ( i = 0 ; i < globbuf . gl_pathc ; i + + ) {
char * path = globbuf . gl_pathv [ i ] ;
struct stat statbuf ;
FILE * in ;
char expr [ EXPR_SIZE ] ;
if ( ( stat ( path , & statbuf ) ! = 0 ) | | ( ! S_ISREG ( statbuf . st_mode ) ) )
continue ;
in = fopen ( path , " rb " ) ;
if ( in = = NULL ) {
ret = - 1 ;
continue ;
}
2022-09-02 18:47:48 +03:00
while ( fgets ( expr , EXPR_SIZE , in ) ! = NULL ) {
2020-08-24 17:28:54 +03:00
char outPath [ PATH_SIZE ] ;
FILE * out ;
int j ;
for ( j = 0 ; expr [ j ] ! = 0 ; j + + )
if ( expr [ j ] = = ' \r ' | | expr [ j ] = = ' \n ' )
break ;
expr [ j ] = 0 ;
size = snprintf ( outPath , sizeof ( outPath ) , " seed/xpath/%s-%d " ,
name , globalData . counter ) ;
if ( size > = PATH_SIZE ) {
ret = - 1 ;
continue ;
}
out = fopen ( outPath , " wb " ) ;
if ( out = = NULL ) {
ret = - 1 ;
continue ;
}
2023-03-08 15:59:03 +03:00
/* Max allocations. */
xmlFuzzWriteInt ( out , 0 , 4 ) ;
2020-08-24 17:28:54 +03:00
if ( xptr ) {
xmlFuzzWriteString ( out , expr ) ;
} else {
char xptrExpr [ EXPR_SIZE + 100 ] ;
/* Wrap XPath expressions as XPointer */
snprintf ( xptrExpr , sizeof ( xptrExpr ) , " xpointer(%s) " , expr ) ;
xmlFuzzWriteString ( out , xptrExpr ) ;
}
xmlFuzzWriteString ( out , data ) ;
fclose ( out ) ;
globalData . counter + + ;
}
fclose ( in ) ;
}
globfree ( & globbuf ) ;
return ( ret ) ;
}
2022-09-02 18:47:48 +03:00
static int
2020-08-24 17:28:54 +03:00
processXPathDir ( const char * testDir ) {
char pattern [ PATH_SIZE ] ;
glob_t globbuf ;
size_t i , size ;
int ret = 0 ;
globalData . counter = 1 ;
if ( processXPath ( testDir , " " , " expr " , " <d></d> " , " expr " , 0 ) ! = 0 )
ret = - 1 ;
size = snprintf ( pattern , sizeof ( pattern ) , " %s/docs/* " , testDir ) ;
if ( size > = PATH_SIZE )
return ( 1 ) ;
if ( glob ( pattern , 0 , NULL , & globbuf ) ! = 0 )
return ( 1 ) ;
for ( i = 0 ; i < globbuf . gl_pathc ; i + + ) {
char * path = globbuf . gl_pathv [ i ] ;
char * data ;
const char * docFile ;
data = xmlSlurpFile ( path , NULL ) ;
if ( data = = NULL ) {
ret = - 1 ;
continue ;
}
docFile = basename ( path ) ;
globalData . counter = 1 ;
if ( processXPath ( testDir , docFile , docFile , data , " tests " , 0 ) ! = 0 )
ret = - 1 ;
if ( processXPath ( testDir , docFile , docFile , data , " xptr " , 1 ) ! = 0 )
ret = - 1 ;
2022-04-21 04:52:52 +03:00
if ( processXPath ( testDir , docFile , docFile , data , " xptr-xp1 " , 1 ) ! = 0 )
ret = - 1 ;
2020-08-24 17:28:54 +03:00
xmlFree ( data ) ;
}
globfree ( & globbuf ) ;
return ( ret ) ;
}
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
int
main ( int argc , const char * * argv ) {
2021-02-22 23:26:13 +03:00
mainFunc processArg = NULL ;
2020-08-24 17:28:54 +03:00
const char * fuzzer ;
int ret = 0 ;
int i ;
if ( argc < 3 ) {
fprintf ( stderr , " usage: seed [FUZZER] [PATTERN...] \n " ) ;
return ( 1 ) ;
}
fuzzer = argv [ 1 ] ;
if ( strcmp ( fuzzer , " html " ) = = 0 ) {
2021-02-22 23:26:13 +03:00
# ifdef HAVE_HTML_FUZZER
processArg = processPattern ;
2020-08-24 17:28:54 +03:00
globalData . processFile = processHtml ;
2024-05-13 13:18:08 +03:00
# endif
} else if ( strcmp ( fuzzer , " lint " ) = = 0 ) {
# ifdef HAVE_LINT_FUZZER
processArg = processPattern ;
globalData . flags | = FLAG_LINT ;
globalData . processFile = processXml ;
2024-04-16 14:24:12 +03:00
# endif
} else if ( strcmp ( fuzzer , " reader " ) = = 0 ) {
# ifdef HAVE_READER_FUZZER
processArg = processPattern ;
globalData . flags | = FLAG_READER ;
globalData . processFile = processXml ;
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
} else if ( strcmp ( fuzzer , " schema " ) = = 0 ) {
2021-02-22 23:26:13 +03:00
# ifdef HAVE_SCHEMA_FUZZER
processArg = processPattern ;
2020-08-24 17:28:54 +03:00
globalData . processFile = processSchema ;
2023-03-12 18:15:54 +03:00
# endif
} else if ( strcmp ( fuzzer , " valid " ) = = 0 ) {
2023-11-27 20:03:01 +03:00
# ifdef HAVE_VALID_FUZZER
2023-03-12 18:15:54 +03:00
processArg = processPattern ;
globalData . processFile = processXml ;
2022-12-26 19:49:27 +03:00
# endif
} else if ( strcmp ( fuzzer , " xinclude " ) = = 0 ) {
# ifdef HAVE_XINCLUDE_FUZZER
processArg = processPattern ;
globalData . processFile = processXml ;
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
} else if ( strcmp ( fuzzer , " xml " ) = = 0 ) {
2021-02-22 23:26:13 +03:00
# ifdef HAVE_XML_FUZZER
processArg = processPattern ;
2020-08-24 17:28:54 +03:00
globalData . processFile = processXml ;
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
} else if ( strcmp ( fuzzer , " xpath " ) = = 0 ) {
2021-02-22 23:26:13 +03:00
# ifdef HAVE_XPATH_FUZZER
2020-08-24 17:28:54 +03:00
processArg = processXPathDir ;
2021-02-22 23:26:13 +03:00
# endif
2020-08-24 17:28:54 +03:00
} else {
fprintf ( stderr , " unknown fuzzer %s \n " , fuzzer ) ;
return ( 1 ) ;
}
globalData . fuzzer = fuzzer ;
if ( getcwd ( globalData . cwd , PATH_SIZE ) = = NULL ) {
fprintf ( stderr , " couldn't get current directory \n " ) ;
return ( 1 ) ;
}
2021-02-22 23:26:13 +03:00
if ( processArg ! = NULL )
for ( i = 2 ; i < argc ; i + + )
processArg ( argv [ i ] ) ;
2020-08-24 17:28:54 +03:00
return ( ret ) ;
}