/*
 * parserInternals.c : Internal routines (and obsolete ones) needed for the
 *                     XML and HTML parsers.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
2002-03-18 22:37:11 +03:00
# define IN_LIBXML
2001-04-21 20:57:29 +04:00
# include "libxml.h"
2022-03-01 00:42:10 +03:00
/*
 * XML_DIR_SEP: platform directory separator as a single character.
 * Backslash when _WIN32 is defined, forward slash otherwise.
 */
#if defined(_WIN32)
#define XML_DIR_SEP '\\'
#else
#define XML_DIR_SEP '/'
#endif
# include <string.h>
# include <ctype.h>
# include <stdlib.h>
2022-03-02 02:29:17 +03:00
2001-02-23 20:55:21 +03:00
# include <libxml/xmlmemory.h>
# include <libxml/tree.h>
# include <libxml/parser.h>
# include <libxml/parserInternals.h>
# include <libxml/valid.h>
# include <libxml/entities.h>
# include <libxml/xmlerror.h>
# include <libxml/encoding.h>
# include <libxml/valid.h>
# include <libxml/xmlIO.h>
# include <libxml/uri.h>
2003-08-18 16:15:38 +04:00
# include <libxml/dict.h>
2001-09-14 14:29:27 +04:00
# include <libxml/SAX.h>
2001-08-22 18:29:45 +04:00
# ifdef LIBXML_CATALOG_ENABLED
# include <libxml/catalog.h>
# endif
2001-10-17 19:58:35 +04:00
# include <libxml/globals.h>
2003-10-11 19:22:13 +04:00
# include <libxml/chvalid.h>
2001-02-23 20:55:21 +03:00
2016-03-02 02:18:04 +03:00
/*
 * Shorthand accessors for the current input of a parser context.
 * The argument is parenthesised so that any pointer expression
 * (e.g. "ctxts + 1" or "&ctxt") expands correctly.
 *
 * CUR(ctxt)        current read position of the input
 * END(ctxt)        end of the available input
 * VALID_CTXT(ctxt) non-zero while the read position has not passed the end
 */
#define CUR(ctxt) ((ctxt)->input->cur)
#define END(ctxt) ((ctxt)->input->end)
#define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
2022-08-26 02:22:33 +03:00
# include "private/buf.h"
# include "private/enc.h"
# include "private/error.h"
# include "private/io.h"
# include "private/parser.h"
2012-07-16 10:19:49 +04:00
2001-07-25 21:18:57 +04:00
/*
* Various global defaults for parsing
*/
2001-02-23 20:55:21 +03:00
2001-07-18 23:30:27 +04:00
/**
2001-02-23 20:55:21 +03:00
* xmlCheckVersion :
* @ version : the include version number
*
* check the compiled lib version against the include one .
* This can warn or immediately kill the application
*/
void
xmlCheckVersion ( int version ) {
2022-09-01 02:18:30 +03:00
int myversion = LIBXML_VERSION ;
2001-02-23 20:55:21 +03:00
2001-10-14 13:56:15 +04:00
xmlInitParser ( ) ;
2001-05-08 00:50:47 +04:00
2001-02-23 20:55:21 +03:00
if ( ( myversion / 10000 ) ! = ( version / 10000 ) ) {
2012-09-11 09:26:36 +04:00
xmlGenericError ( xmlGenericErrorContext ,
2001-02-23 20:55:21 +03:00
" Fatal: program compiled against libxml %d using libxml %d \n " ,
( version / 10000 ) , ( myversion / 10000 ) ) ;
2012-09-11 09:26:36 +04:00
fprintf ( stderr ,
2001-11-20 11:35:07 +03:00
" Fatal: program compiled against libxml %d using libxml %d \n " ,
( version / 10000 ) , ( myversion / 10000 ) ) ;
2001-02-23 20:55:21 +03:00
}
if ( ( myversion / 100 ) < ( version / 100 ) ) {
2012-09-11 09:26:36 +04:00
xmlGenericError ( xmlGenericErrorContext ,
2001-02-23 20:55:21 +03:00
" Warning: program compiled against libxml %d using older %d \n " ,
( version / 100 ) , ( myversion / 100 ) ) ;
}
}
2003-10-06 01:33:18 +04:00
/************************************************************************
 *                                                                      *
 *              Some factorized error routines                          *
 *                                                                      *
 ************************************************************************/
/**
 * xmlErrMemory:
 * @ctxt:  an XML parser context, may be NULL
 * @extra:  extra information, may be NULL
 *
 * Handle an out-of-memory error.
 *
 * When a context is supplied, its errNo is set to XML_ERR_NO_MEMORY, it is
 * moved to the XML_PARSER_EOF state and SAX is disabled. Nothing is done
 * if the context is already in that stopped state (SAX disabled and
 * instate == XML_PARSER_EOF).
 */
void
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
{
    /* Already stopped: do not report again. */
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
        (ctxt->instate == XML_PARSER_EOF))
        return;

    if (ctxt != NULL) {
        ctxt->errNo = XML_ERR_NO_MEMORY;
        ctxt->instate = XML_PARSER_EOF;
        ctxt->disableSAX = 1;
    }

    if (extra)
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                        NULL, NULL, 0, 0,
                        "Memory allocation failed : %s\n", extra);
    else
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
}
/**
2003-10-19 17:35:37 +04:00
* __xmlErrEncoding :
2003-10-06 01:33:18 +04:00
* @ ctxt : an XML parser context
2004-08-16 04:39:03 +04:00
* @ xmlerr : the error number
2003-10-06 01:33:18 +04:00
* @ msg : the error message
* @ str1 : an string info
* @ str2 : an string info
*
* Handle an encoding error
*/
2003-10-19 17:35:37 +04:00
void
2004-08-16 04:39:03 +04:00
__xmlErrEncoding ( xmlParserCtxtPtr ctxt , xmlParserErrors xmlerr ,
2003-10-19 17:35:37 +04:00
const char * msg , const xmlChar * str1 , const xmlChar * str2 )
2003-10-06 01:33:18 +04:00
{
2003-10-31 13:36:03 +03:00
if ( ( ctxt ! = NULL ) & & ( ctxt - > disableSAX ! = 0 ) & &
( ctxt - > instate = = XML_PARSER_EOF ) )
return ;
2003-10-06 01:33:18 +04:00
if ( ctxt ! = NULL )
2004-08-16 04:39:03 +04:00
ctxt - > errNo = xmlerr ;
2003-10-10 18:10:40 +04:00
__xmlRaiseError ( NULL , NULL , NULL ,
2004-08-16 04:39:03 +04:00
ctxt , NULL , XML_FROM_PARSER , xmlerr , XML_ERR_FATAL ,
2003-10-06 01:33:18 +04:00
NULL , 0 , ( const char * ) str1 , ( const char * ) str2 ,
NULL , 0 , 0 , msg , str1 , str2 ) ;
if ( ctxt ! = NULL ) {
ctxt - > wellFormed = 0 ;
if ( ctxt - > recovery = = 0 )
ctxt - > disableSAX = 1 ;
}
}
/**
* xmlErrInternal :
* @ ctxt : an XML parser context
* @ msg : the error message
2020-03-08 19:19:42 +03:00
* @ str : error information
2003-10-06 01:33:18 +04:00
*
* Handle an internal error
*/
2016-05-13 10:13:17 +03:00
static void LIBXML_ATTR_FORMAT ( 2 , 0 )
2003-10-06 01:33:18 +04:00
xmlErrInternal ( xmlParserCtxtPtr ctxt , const char * msg , const xmlChar * str )
{
2003-10-31 13:36:03 +03:00
if ( ( ctxt ! = NULL ) & & ( ctxt - > disableSAX ! = 0 ) & &
( ctxt - > instate = = XML_PARSER_EOF ) )
return ;
2003-10-06 01:33:18 +04:00
if ( ctxt ! = NULL )
ctxt - > errNo = XML_ERR_INTERNAL_ERROR ;
2003-10-10 18:10:40 +04:00
__xmlRaiseError ( NULL , NULL , NULL ,
2003-10-06 01:33:18 +04:00
ctxt , NULL , XML_FROM_PARSER , XML_ERR_INTERNAL_ERROR ,
XML_ERR_FATAL , NULL , 0 , ( const char * ) str , NULL , NULL ,
0 , 0 , msg , str ) ;
if ( ctxt ! = NULL ) {
ctxt - > wellFormed = 0 ;
if ( ctxt - > recovery = = 0 )
ctxt - > disableSAX = 1 ;
}
}
2023-04-30 18:51:29 +03:00
/**
* xmlFatalErr :
* @ ctxt : an XML parser context
* @ error : the error number
* @ extra : extra information string
*
* Handle a fatal parser error , i . e . violating Well - Formedness constraints
*/
void
xmlFatalErr ( xmlParserCtxtPtr ctxt , xmlParserErrors error , const char * info )
{
const char * errmsg ;
if ( ( ctxt ! = NULL ) & & ( ctxt - > disableSAX ! = 0 ) & &
( ctxt - > instate = = XML_PARSER_EOF ) )
return ;
switch ( error ) {
case XML_ERR_INVALID_HEX_CHARREF :
errmsg = " CharRef: invalid hexadecimal value " ;
break ;
case XML_ERR_INVALID_DEC_CHARREF :
errmsg = " CharRef: invalid decimal value " ;
break ;
case XML_ERR_INVALID_CHARREF :
errmsg = " CharRef: invalid value " ;
break ;
case XML_ERR_INTERNAL_ERROR :
errmsg = " internal error " ;
break ;
case XML_ERR_PEREF_AT_EOF :
errmsg = " PEReference at end of document " ;
break ;
case XML_ERR_PEREF_IN_PROLOG :
errmsg = " PEReference in prolog " ;
break ;
case XML_ERR_PEREF_IN_EPILOG :
errmsg = " PEReference in epilog " ;
break ;
case XML_ERR_PEREF_NO_NAME :
errmsg = " PEReference: no name " ;
break ;
case XML_ERR_PEREF_SEMICOL_MISSING :
errmsg = " PEReference: expecting ';' " ;
break ;
case XML_ERR_ENTITY_LOOP :
errmsg = " Detected an entity reference loop " ;
break ;
case XML_ERR_ENTITY_NOT_STARTED :
errmsg = " EntityValue: \" or ' expected " ;
break ;
case XML_ERR_ENTITY_PE_INTERNAL :
errmsg = " PEReferences forbidden in internal subset " ;
break ;
case XML_ERR_ENTITY_NOT_FINISHED :
errmsg = " EntityValue: \" or ' expected " ;
break ;
case XML_ERR_ATTRIBUTE_NOT_STARTED :
errmsg = " AttValue: \" or ' expected " ;
break ;
case XML_ERR_LT_IN_ATTRIBUTE :
errmsg = " Unescaped '<' not allowed in attributes values " ;
break ;
case XML_ERR_LITERAL_NOT_STARTED :
errmsg = " SystemLiteral \" or ' expected " ;
break ;
case XML_ERR_LITERAL_NOT_FINISHED :
errmsg = " Unfinished System or Public ID \" or ' expected " ;
break ;
case XML_ERR_MISPLACED_CDATA_END :
errmsg = " Sequence ']]>' not allowed in content " ;
break ;
case XML_ERR_URI_REQUIRED :
errmsg = " SYSTEM or PUBLIC, the URI is missing " ;
break ;
case XML_ERR_PUBID_REQUIRED :
errmsg = " PUBLIC, the Public Identifier is missing " ;
break ;
case XML_ERR_HYPHEN_IN_COMMENT :
errmsg = " Comment must not contain '--' (double-hyphen) " ;
break ;
case XML_ERR_PI_NOT_STARTED :
errmsg = " xmlParsePI : no target name " ;
break ;
case XML_ERR_RESERVED_XML_NAME :
errmsg = " Invalid PI name " ;
break ;
case XML_ERR_NOTATION_NOT_STARTED :
errmsg = " NOTATION: Name expected here " ;
break ;
case XML_ERR_NOTATION_NOT_FINISHED :
errmsg = " '>' required to close NOTATION declaration " ;
break ;
case XML_ERR_VALUE_REQUIRED :
errmsg = " Entity value required " ;
break ;
case XML_ERR_URI_FRAGMENT :
errmsg = " Fragment not allowed " ;
break ;
case XML_ERR_ATTLIST_NOT_STARTED :
errmsg = " '(' required to start ATTLIST enumeration " ;
break ;
case XML_ERR_NMTOKEN_REQUIRED :
errmsg = " NmToken expected in ATTLIST enumeration " ;
break ;
case XML_ERR_ATTLIST_NOT_FINISHED :
errmsg = " ')' required to finish ATTLIST enumeration " ;
break ;
case XML_ERR_MIXED_NOT_STARTED :
errmsg = " MixedContentDecl : '|' or ')*' expected " ;
break ;
case XML_ERR_PCDATA_REQUIRED :
errmsg = " MixedContentDecl : '#PCDATA' expected " ;
break ;
case XML_ERR_ELEMCONTENT_NOT_STARTED :
errmsg = " ContentDecl : Name or '(' expected " ;
break ;
case XML_ERR_ELEMCONTENT_NOT_FINISHED :
errmsg = " ContentDecl : ',' '|' or ')' expected " ;
break ;
case XML_ERR_PEREF_IN_INT_SUBSET :
errmsg =
" PEReference: forbidden within markup decl in internal subset " ;
break ;
case XML_ERR_GT_REQUIRED :
errmsg = " expected '>' " ;
break ;
case XML_ERR_CONDSEC_INVALID :
errmsg = " XML conditional section '[' expected " ;
break ;
case XML_ERR_EXT_SUBSET_NOT_FINISHED :
errmsg = " Content error in the external subset " ;
break ;
case XML_ERR_CONDSEC_INVALID_KEYWORD :
errmsg =
" conditional section INCLUDE or IGNORE keyword expected " ;
break ;
case XML_ERR_CONDSEC_NOT_FINISHED :
errmsg = " XML conditional section not closed " ;
break ;
case XML_ERR_XMLDECL_NOT_STARTED :
errmsg = " Text declaration '<?xml' required " ;
break ;
case XML_ERR_XMLDECL_NOT_FINISHED :
errmsg = " parsing XML declaration: '?>' expected " ;
break ;
case XML_ERR_EXT_ENTITY_STANDALONE :
errmsg = " external parsed entities cannot be standalone " ;
break ;
case XML_ERR_ENTITYREF_SEMICOL_MISSING :
errmsg = " EntityRef: expecting ';' " ;
break ;
case XML_ERR_DOCTYPE_NOT_FINISHED :
errmsg = " DOCTYPE improperly terminated " ;
break ;
case XML_ERR_LTSLASH_REQUIRED :
errmsg = " EndTag: '</' not found " ;
break ;
case XML_ERR_EQUAL_REQUIRED :
errmsg = " expected '=' " ;
break ;
case XML_ERR_STRING_NOT_CLOSED :
errmsg = " String not closed expecting \" or ' " ;
break ;
case XML_ERR_STRING_NOT_STARTED :
errmsg = " String not started expecting ' or \" " ;
break ;
case XML_ERR_ENCODING_NAME :
errmsg = " Invalid XML encoding name " ;
break ;
case XML_ERR_STANDALONE_VALUE :
errmsg = " standalone accepts only 'yes' or 'no' " ;
break ;
case XML_ERR_DOCUMENT_EMPTY :
errmsg = " Document is empty " ;
break ;
case XML_ERR_DOCUMENT_END :
errmsg = " Extra content at the end of the document " ;
break ;
case XML_ERR_NOT_WELL_BALANCED :
errmsg = " chunk is not well balanced " ;
break ;
case XML_ERR_EXTRA_CONTENT :
errmsg = " extra content at the end of well balanced chunk " ;
break ;
case XML_ERR_VERSION_MISSING :
errmsg = " Malformed declaration expecting version " ;
break ;
case XML_ERR_NAME_TOO_LONG :
errmsg = " Name too long " ;
break ;
case XML_ERR_INVALID_ENCODING :
errmsg = " Invalid bytes in character encoding " ;
break ;
case XML_IO_UNKNOWN :
errmsg = " I/O error " ;
break ;
#if 0
case :
errmsg = " " ;
break ;
# endif
default :
errmsg = " Unregistered error message " ;
}
if ( ctxt ! = NULL )
ctxt - > errNo = error ;
if ( info = = NULL ) {
__xmlRaiseError ( NULL , NULL , NULL , ctxt , NULL , XML_FROM_PARSER , error ,
XML_ERR_FATAL , NULL , 0 , info , NULL , NULL , 0 , 0 , " %s \n " ,
errmsg ) ;
} else {
__xmlRaiseError ( NULL , NULL , NULL , ctxt , NULL , XML_FROM_PARSER , error ,
XML_ERR_FATAL , NULL , 0 , info , NULL , NULL , 0 , 0 , " %s: %s \n " ,
errmsg , info ) ;
}
if ( ctxt ! = NULL ) {
ctxt - > wellFormed = 0 ;
if ( ctxt - > recovery = = 0 )
ctxt - > disableSAX = 1 ;
}
}
2003-10-06 01:33:18 +04:00
/**
* xmlErrEncodingInt :
* @ ctxt : an XML parser context
* @ error : the error number
* @ msg : the error message
* @ val : an integer value
*
* n encoding error
*/
2016-05-13 10:13:17 +03:00
static void LIBXML_ATTR_FORMAT ( 3 , 0 )
2003-10-06 01:33:18 +04:00
xmlErrEncodingInt ( xmlParserCtxtPtr ctxt , xmlParserErrors error ,
const char * msg , int val )
{
2003-10-31 13:36:03 +03:00
if ( ( ctxt ! = NULL ) & & ( ctxt - > disableSAX ! = 0 ) & &
( ctxt - > instate = = XML_PARSER_EOF ) )
return ;
2003-10-06 01:33:18 +04:00
if ( ctxt ! = NULL )
ctxt - > errNo = error ;
2003-10-10 18:10:40 +04:00
__xmlRaiseError ( NULL , NULL , NULL ,
2003-10-06 01:33:18 +04:00
ctxt , NULL , XML_FROM_PARSER , error , XML_ERR_FATAL ,
NULL , 0 , NULL , NULL , NULL , val , 0 , msg , val ) ;
if ( ctxt ! = NULL ) {
ctxt - > wellFormed = 0 ;
if ( ctxt - > recovery = = 0 )
ctxt - > disableSAX = 1 ;
}
}
2001-02-23 20:55:21 +03:00
/**
 * xmlIsLetter:
 * @c:  an unicode character (int)
 *
 * Check whether the character is allowed by the production
 * [84] Letter ::= BaseChar | Ideographic
 *
 * Returns 0 if not, non-zero otherwise
 */
int
xmlIsLetter(int c) {
    return(IS_BASECHAR(c) || IS_IDEOGRAPHIC(c));
}
/************************************************************************
 *                                                                      *
 *              Input handling functions for progressive parsing        *
 *                                                                      *
 ************************************************************************/
/* #define DEBUG_INPUT */
/* #define DEBUG_STACK */
/* #define DEBUG_PUSH */
/* we need to keep enough input to show errors in context */
# define LINE_LEN 80
#ifdef DEBUG_INPUT
#define CHECK_BUFFER(in) check_buffer(in)

/*
 * check_buffer:
 * @in:  an XML parser input
 *
 * Debug-only consistency check (compiled when DEBUG_INPUT is defined).
 * Reports through the generic error handler when the base/cur pointers of
 * @in disagree with its underlying buffer, then dumps the input state.
 */
static
void check_buffer(xmlParserInputPtr in) {
    if (in->base != xmlBufContent(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: base mismatch problem\n");
    }
    if (in->cur < in->base) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: cur < base problem\n");
    }
    if (in->cur > in->base + xmlBufUse(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: cur > base + use problem\n");
    }
    /*
     * Print pointers with %p and cast the offset/size to long types so the
     * arguments match their conversion specifiers on every platform.
     */
    xmlGenericError(xmlGenericErrorContext,
            "buffer %p : content %p, cur %ld, use %lu\n",
            (void *) in, (void *) xmlBufContent(in->buf->buffer),
            (long) (in->cur - in->base),
            (unsigned long) xmlBufUse(in->buf->buffer));
}

#else
#define CHECK_BUFFER(in)
#endif
2023-03-14 16:42:36 +03:00
/**
 * xmlHaltParser:
 * @ctxt:  an XML parser context, may be NULL
 *
 * Blocks further parser processing without overriding the recorded error.
 * For internal use.
 *
 * The context is moved to XML_PARSER_EOF with SAX disabled, every input
 * above the first one is popped and freed, and the remaining input is
 * replaced by an empty string after releasing its buffers.
 */
void
xmlHaltParser(xmlParserCtxtPtr ctxt) {
    if (ctxt == NULL)
        return;

    ctxt->instate = XML_PARSER_EOF;
    ctxt->disableSAX = 1;

    while (ctxt->inputNr > 1)
        xmlFreeInputStream(inputPop(ctxt));

    if (ctxt->input != NULL) {
        /*
         * in case there was a specific allocation deallocate before
         * overriding base
         */
        if (ctxt->input->free != NULL) {
            ctxt->input->free((xmlChar *) ctxt->input->base);
            ctxt->input->free = NULL;
        }
        if (ctxt->input->buf != NULL) {
            xmlFreeParserInputBuffer(ctxt->input->buf);
            ctxt->input->buf = NULL;
        }
        /* Point base, cur and end at one empty, NUL-terminated string. */
        ctxt->input->cur = BAD_CAST "";
        ctxt->input->length = 0;
        ctxt->input->base = ctxt->input->cur;
        ctxt->input->end = ctxt->input->cur;
    }
}
2001-02-23 20:55:21 +03:00
/**
* xmlParserInputRead :
* @ in : an XML parser input
* @ len : an indicative size for the lookahead
*
2022-08-24 16:12:24 +03:00
* DEPRECATED : This function was internal and is deprecated .
2001-02-23 20:55:21 +03:00
*
2012-07-16 10:19:49 +04:00
* Returns - 1 as this is an error to use it .
2001-02-23 20:55:21 +03:00
*/
int
2012-07-16 10:19:49 +04:00
xmlParserInputRead ( xmlParserInputPtr in ATTRIBUTE_UNUSED , int len ATTRIBUTE_UNUSED ) {
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
2023-03-12 18:47:15 +03:00
/**
 * xmlParserGrow:
 * @ctxt:  an XML parser context
 *
 * Try to read more data into the current input of @ctxt.
 *
 * Returns 0 when nothing was read because the input has no buffer, the
 * context is a push (progressive) parser, or at least INPUT_CHUNK bytes
 * are already available; -1 if the buffer is in an error state or the
 * lookup limit was exceeded (the parser is halted in the latter case);
 * otherwise the result of xmlParserInputBufferGrow(), which is negative
 * on failure.
 */
int
xmlParserGrow(xmlParserCtxtPtr ctxt) {
    xmlParserInputPtr in = ctxt->input;
    xmlParserInputBufferPtr buf = in->buf;
    ptrdiff_t curEnd = in->end - in->cur;
    ptrdiff_t curBase = in->cur - in->base;
    int ret;

    if (buf == NULL)
        return(0);
    /* Don't grow push parser buffer. */
    if (ctxt->progressive)
        return(0);
    if (buf->error != 0)
        return(-1);

    /* Refuse oversized lookahead/lookbehind unless XML_PARSE_HUGE is set. */
    if (((curEnd > XML_MAX_LOOKUP_LIMIT) ||
         (curBase > XML_MAX_LOOKUP_LIMIT)) &&
        ((ctxt->options & XML_PARSE_HUGE) == 0)) {
        xmlErrMemory(ctxt, "Huge input lookup");
        xmlHaltParser(ctxt);
        return(-1);
    }

    if (curEnd >= INPUT_CHUNK)
        return(0);

    ret = xmlParserInputBufferGrow(buf, INPUT_CHUNK);
    /* Re-sync the input pointers with the buffer, keeping the offset. */
    xmlBufUpdateInput(buf->buffer, in, curBase);

    if (ret < 0) {
        xmlFatalErr(ctxt, buf->error, NULL);
        /* Buffer contents may be lost in case of memory errors. */
        if (buf->error == XML_ERR_NO_MEMORY)
            xmlHaltParser(ctxt);
    }

    return(ret);
}
2001-02-23 20:55:21 +03:00
/**
 * xmlParserInputGrow:
 * @in:  an XML parser input
 * @len:  an indicative size for the lookahead
 *
 * DEPRECATED: Don't use.
 *
 * This function increases the input for the parser. It tries to
 * preserve pointers to the input buffer, and keeps already read data.
 *
 * Returns the amount of char read, or -1 in case of error, 0 indicates
 * the end of this entity. 0 is also returned, without reading, when more
 * than INPUT_CHUNK bytes are already available past the current position.
 */
int
xmlParserInputGrow(xmlParserInputPtr in, int len) {
    int ret;
    size_t indx;

    if ((in == NULL) || (len < 0)) return(-1);
#ifdef DEBUG_INPUT
    xmlGenericError(xmlGenericErrorContext, "Grow\n");
#endif
    if (in->buf == NULL) return(-1);
    if (in->base == NULL) return(-1);
    if (in->cur == NULL) return(-1);
    if (in->buf->buffer == NULL) return(-1);

    CHECK_BUFFER(in);

    indx = in->cur - in->base;
    /*
     * Compare in size_t: narrowing the offset to unsigned int would
     * truncate it for buffers larger than UINT_MAX bytes.
     */
    if (xmlBufUse(in->buf->buffer) > indx + INPUT_CHUNK) {
        CHECK_BUFFER(in);
        return(0);
    }

    ret = xmlParserInputBufferGrow(in->buf, len);

    /* The buffer may have moved: recompute base, cur and end. */
    in->base = xmlBufContent(in->buf->buffer);
    if (in->base == NULL) {
        in->base = BAD_CAST "";
        in->cur = in->base;
        in->end = in->base;
        return(-1);
    }
    in->cur = in->base + indx;
    in->end = xmlBufEnd(in->buf->buffer);

    CHECK_BUFFER(in);

    return(ret);
}
2023-03-13 19:51:13 +03:00
/**
* xmlParserShrink :
* @ ctxt : an XML parser context
*/
2023-03-21 15:08:44 +03:00
void
2023-03-13 19:51:13 +03:00
xmlParserShrink ( xmlParserCtxtPtr ctxt ) {
xmlParserInputPtr in = ctxt - > input ;
xmlParserInputBufferPtr buf = in - > buf ;
size_t used ;
2023-05-09 14:28:06 +03:00
/* Don't shrink pull parser memory buffers. */
2023-03-13 19:51:13 +03:00
if ( ( buf = = NULL ) | |
2023-05-09 14:28:06 +03:00
( ( ctxt - > progressive = = 0 ) & &
( buf - > encoder = = NULL ) & & ( buf - > readcallback = = NULL ) ) )
2023-03-21 15:08:44 +03:00
return ;
2023-03-13 19:51:13 +03:00
used = in - > cur - in - > base ;
/*
* Do not shrink on large buffers whose only a tiny fraction
* was consumed
*/
if ( used > INPUT_CHUNK ) {
size_t res = xmlBufShrink ( buf - > buffer , used - LINE_LEN ) ;
if ( res > 0 ) {
used - = res ;
if ( ( res > ULONG_MAX ) | |
( in - > consumed > ULONG_MAX - ( unsigned long ) res ) )
in - > consumed = ULONG_MAX ;
else
in - > consumed + = res ;
}
}
2023-08-08 16:21:14 +03:00
xmlBufUpdateInput ( buf - > buffer , in , used ) ;
2023-03-13 19:51:13 +03:00
}
2001-02-23 20:55:21 +03:00
/**
 * xmlParserInputShrink:
 * @in:  an XML parser input
 *
 * DEPRECATED: Don't use.
 *
 * This function removes used input for the parser. If the buffer then
 * holds INPUT_CHUNK bytes or fewer, more data is read into it. The base,
 * cur and end pointers of @in are recomputed afterwards; if the buffer has
 * no content they are all pointed at an empty string. Does nothing when
 * @in or any of its buffer/base/cur members is NULL.
 */
void
xmlParserInputShrink(xmlParserInputPtr in) {
    size_t used;
    size_t ret;

#ifdef DEBUG_INPUT
    xmlGenericError(xmlGenericErrorContext, "Shrink\n");
#endif
    if (in == NULL) return;
    if (in->buf == NULL) return;
    if (in->base == NULL) return;
    if (in->cur == NULL) return;
    if (in->buf->buffer == NULL) return;

    CHECK_BUFFER(in);

    used = in->cur - in->base;
    /*
     * Only shrink once more than INPUT_CHUNK bytes were consumed, and keep
     * the last LINE_LEN bytes so errors can still be shown in context.
     */
    if (used > INPUT_CHUNK) {
        ret = xmlBufShrink(in->buf->buffer, used - LINE_LEN);
        if (ret > 0) {
            used -= ret;
            /* Saturate the consumed counter instead of wrapping. */
            if ((ret > ULONG_MAX) ||
                (in->consumed > ULONG_MAX - (unsigned long)ret))
                in->consumed = ULONG_MAX;
            else
                in->consumed += ret;
        }
    }

    if (xmlBufUse(in->buf->buffer) <= INPUT_CHUNK) {
        xmlParserInputBufferRead(in->buf, 2 * INPUT_CHUNK);
    }

    in->base = xmlBufContent(in->buf->buffer);
    if (in->base == NULL) {
        /* TODO: raise error */
        in->base = BAD_CAST "";
        in->cur = in->base;
        in->end = in->base;
        return;
    }
    in->cur = in->base + used;
    in->end = xmlBufEnd(in->buf->buffer);

    CHECK_BUFFER(in);
}
/************************************************************************
 *                                                                      *
 *              UTF8 character input and related functions              *
 *                                                                      *
 ************************************************************************/
/**
* xmlNextChar :
* @ ctxt : the XML parser context
*
2023-03-13 21:38:41 +03:00
* DEPRECATED : Internal function , do not use .
*
2001-02-23 20:55:21 +03:00
* Skip to the next char input char .
*/
void
2003-03-22 03:04:05 +03:00
xmlNextChar ( xmlParserCtxtPtr ctxt )
{
2004-11-08 17:02:18 +03:00
if ( ( ctxt = = NULL ) | | ( ctxt - > instate = = XML_PARSER_EOF ) | |
( ctxt - > input = = NULL ) )
2003-03-22 03:04:05 +03:00
return ;
2001-02-23 20:55:21 +03:00
2016-03-02 02:18:04 +03:00
if ( ! ( VALID_CTXT ( ctxt ) ) ) {
xmlErrInternal ( ctxt , " Parser input data memory error \n " , NULL ) ;
ctxt - > errNo = XML_ERR_INTERNAL_ERROR ;
xmlStopParser ( ctxt ) ;
return ;
}
2023-03-21 15:26:36 +03:00
if ( ctxt - > input - > end - ctxt - > input - > cur < INPUT_CHUNK ) {
2023-06-07 15:05:34 +03:00
xmlParserGrow ( ctxt ) ;
if ( ( ctxt - > instate = = XML_PARSER_EOF ) | |
( ctxt - > input - > cur > = ctxt - > input - > end ) )
2023-03-15 18:18:11 +03:00
return ;
2016-03-02 02:18:04 +03:00
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( ctxt - > input - > flags & XML_INPUT_8_BIT ) = = 0 ) {
2016-03-02 02:18:04 +03:00
const unsigned char * cur ;
unsigned char c ;
/*
* 2.11 End - of - Line Handling
* the literal two - character sequence " #xD#xA " or a standalone
* literal # xD , an XML processor must pass to the application
* the single character # xA .
*/
if ( * ( ctxt - > input - > cur ) = = ' \n ' ) {
ctxt - > input - > line + + ; ctxt - > input - > col = 1 ;
} else
ctxt - > input - > col + + ;
/*
* We are supposed to handle UTF8 , check it ' s valid
* From rfc2044 : encoding of the Unicode values on UTF - 8 :
*
* UCS - 4 range ( hex . ) UTF - 8 octet sequence ( binary )
* 0000 0000 - 0000 007F 0 xxxxxxx
* 0000 00 80 - 0000 07FF 110 xxxxx 10 xxxxxx
* 0000 0800 - 0000 FFFF 1110 xxxx 10 xxxxxx 10 xxxxxx
*
* Check for the 0x110000 limit too
*/
cur = ctxt - > input - > cur ;
c = * cur ;
if ( c & 0x80 ) {
2023-03-15 18:18:11 +03:00
size_t avail ;
2017-08-28 21:40:19 +03:00
if ( c = = 0xC0 )
goto encoding_error ;
2023-03-15 18:18:11 +03:00
avail = ctxt - > input - > end - ctxt - > input - > cur ;
if ( ( avail < 2 ) | | ( cur [ 1 ] & 0xc0 ) ! = 0x80 )
2016-03-02 02:18:04 +03:00
goto encoding_error ;
if ( ( c & 0xe0 ) = = 0xe0 ) {
unsigned int val ;
2023-03-15 18:18:11 +03:00
if ( ( avail < 3 ) | | ( cur [ 2 ] & 0xc0 ) ! = 0x80 )
2003-03-22 03:04:05 +03:00
goto encoding_error ;
2016-03-02 02:18:04 +03:00
if ( ( c & 0xf0 ) = = 0xf0 ) {
if ( ( ( c & 0xf8 ) ! = 0xf0 ) | |
2023-03-15 18:18:11 +03:00
( avail < 4 ) | | ( ( cur [ 3 ] & 0xc0 ) ! = 0x80 ) )
2003-03-22 03:04:05 +03:00
goto encoding_error ;
2016-03-02 02:18:04 +03:00
/* 4-byte code */
ctxt - > input - > cur + = 4 ;
val = ( cur [ 0 ] & 0x7 ) < < 18 ;
val | = ( cur [ 1 ] & 0x3f ) < < 12 ;
val | = ( cur [ 2 ] & 0x3f ) < < 6 ;
val | = cur [ 3 ] & 0x3f ;
} else {
/* 3-byte code */
ctxt - > input - > cur + = 3 ;
val = ( cur [ 0 ] & 0xf ) < < 12 ;
val | = ( cur [ 1 ] & 0x3f ) < < 6 ;
val | = cur [ 2 ] & 0x3f ;
}
if ( ( ( val > 0xd7ff ) & & ( val < 0xe000 ) ) | |
( ( val > 0xfffd ) & & ( val < 0x10000 ) ) | |
( val > = 0x110000 ) ) {
xmlErrEncodingInt ( ctxt , XML_ERR_INVALID_CHAR ,
" Char 0x%X out of allowed range \n " ,
val ) ;
}
2003-03-22 03:04:05 +03:00
} else
2016-03-02 02:18:04 +03:00
/* 2-byte code */
ctxt - > input - > cur + = 2 ;
} else
/* 1-byte code */
ctxt - > input - > cur + + ;
2001-02-23 20:55:21 +03:00
} else {
2003-03-22 03:04:05 +03:00
/*
* Assume it ' s a fixed length encoding ( 1 ) with
* a compatible encoding for the ASCII set , since
* XML constructs only use < 128 chars
*/
if ( * ( ctxt - > input - > cur ) = = ' \n ' ) {
2005-01-05 18:37:55 +03:00
ctxt - > input - > line + + ; ctxt - > input - > col = 1 ;
2003-03-22 03:04:05 +03:00
} else
ctxt - > input - > col + + ;
ctxt - > input - > cur + + ;
2001-02-23 20:55:21 +03:00
}
return ;
2003-10-05 17:51:35 +04:00
encoding_error :
2001-02-23 20:55:21 +03:00
/*
* If we detect an UTF8 error that probably mean that the
2001-12-31 19:16:02 +03:00
* input encoding didn ' t get properly advertised in the
2001-02-23 20:55:21 +03:00
* declaration header . Report the error and switch the encoding
* to ISO - Latin - 1 ( if you don ' t like this policy , just declare the
* encoding ! )
*/
2004-11-09 17:59:59 +03:00
if ( ( ctxt = = NULL ) | | ( ctxt - > input = = NULL ) | |
( ctxt - > input - > end - ctxt - > input - > cur < 4 ) ) {
__xmlErrEncoding ( ctxt , XML_ERR_INVALID_CHAR ,
" Input is not proper UTF-8, indicate encoding ! \n " ,
NULL , NULL ) ;
} else {
char buffer [ 150 ] ;
snprintf ( buffer , 149 , " Bytes: 0x%02X 0x%02X 0x%02X 0x%02X \n " ,
ctxt - > input - > cur [ 0 ] , ctxt - > input - > cur [ 1 ] ,
ctxt - > input - > cur [ 2 ] , ctxt - > input - > cur [ 3 ] ) ;
__xmlErrEncoding ( ctxt , XML_ERR_INVALID_CHAR ,
" Input is not proper UTF-8, indicate encoding ! \n %s " ,
BAD_CAST buffer , NULL ) ;
2001-02-23 20:55:21 +03:00
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( ctxt - > input - > flags & XML_INPUT_HAS_ENCODING ) = = 0 ) {
ctxt - > input - > flags | = XML_INPUT_HAS_ENCODING ;
ctxt - > input - > flags | = XML_INPUT_8_BIT ;
}
2002-03-21 00:55:57 +03:00
ctxt - > input - > cur + + ;
2001-02-23 20:55:21 +03:00
return ;
}
/**
 * xmlCurrentChar:
 * @ctxt:  the XML parser context
 * @len:  pointer to the length of the char read
 *
 * DEPRECATED: Internal function, do not use.
 *
 * Decode the character found at the current input position. Unless the
 * input has been flagged as 8-bit (XML_INPUT_8_BIT), the bytes are
 * interpreted as UTF-8 and a character may span up to four bytes.
 *
 * End of line normalization (XML 1.0, 2.11 End-of-Line Handling) is
 * applied: a #xD is reported as #xA, and when the #xD is directly followed
 * by #xA the input cursor is moved onto the #xA so that the pair is seen
 * as a single character.
 *
 * On malformed UTF-8 an error is raised, the input is switched to the
 * 8-bit fallback if no encoding was set so far, and the offending byte is
 * returned with a length of 1. If the input ends in the middle of a
 * multi-byte sequence, no error is raised and 0 is returned with a length
 * of 0.
 *
 * Returns the current char value, its length in bytes being stored
 *         in @len; 0 if an argument is missing or the parser is stopped.
 */
int
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    const unsigned char *cur;
    unsigned char c;

    if (ctxt == NULL)
        return(0);
    if ((len == NULL) || (ctxt->input == NULL))
        return(0);
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    /* Try to keep at least INPUT_CHUNK bytes of lookahead available. */
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
        xmlParserGrow(ctxt);
        if (ctxt->instate == XML_PARSER_EOF)
            return(0);
    }

    cur = ctxt->input->cur;
    c = *cur;

    /* Fast path: printable ASCII needs neither decoding nor normalization. */
    if ((c >= 0x20) && (c <= 0x7F)) {
        *len = 1;
        return(c);
    }

    if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        if (c & 0x80) {
            size_t avail;
            unsigned int val;

            /* A continuation byte or 0xC0 can never start a sequence. */
            if ((c & 0x40) == 0)
                goto encoding_error;
            if (c == 0xC0)
                goto encoding_error;

            avail = ctxt->input->end - ctxt->input->cur;

            if (avail < 2)
                goto incomplete_sequence;
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;

            if ((c & 0xe0) != 0xe0) {
                /* 2-byte code */
                *len = 2;
                val = ((cur[0] & 0x1f) << 6) | (cur[1] & 0x3f);
                /* overlong form */
                if (val < 0x80)
                    goto encoding_error;
            } else {
                if (avail < 3)
                    goto incomplete_sequence;
                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;

                if ((c & 0xf0) != 0xf0) {
                    /* 3-byte code */
                    *len = 3;
                    val = ((cur[0] & 0xf) << 12) |
                          ((cur[1] & 0x3f) << 6) |
                          (cur[2] & 0x3f);
                    /* overlong form */
                    if (val < 0x800)
                        goto encoding_error;
                } else {
                    if (avail < 4)
                        goto incomplete_sequence;
                    if ((c & 0xf8) != 0xf0)
                        goto encoding_error;
                    if ((cur[3] & 0xc0) != 0x80)
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = ((cur[0] & 0x7) << 18) |
                          ((cur[1] & 0x3f) << 12) |
                          ((cur[2] & 0x3f) << 6) |
                          (cur[3] & 0x3f);
                    /* overlong form */
                    if (val < 0x10000)
                        goto encoding_error;
                }
            }

            /* The value is reported as invalid but still handed back. */
            if (!IS_CHAR(val)) {
                xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
                                  "Char 0x%X out of allowed range\n", val);
            }
            return(val);
        }

        /*
         * 1-byte code: only a NUL byte located before the end of the
         * buffer is reported; the value itself is produced by the code
         * shared with the 8-bit case below.
         */
        *len = 1;
        if ((c == 0) && (ctxt->input->end > ctxt->input->cur)) {
            xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
                              "Char 0x0 out of allowed range\n", 0);
        }
    }

    /*
     * Single byte: either an ASCII control character in UTF-8 mode, or
     * a fixed length encoding (1) with a compatible encoding for the
     * ASCII set, since XML constructs only use < 128 chars.
     */
    *len = 1;
    if (*ctxt->input->cur == 0xD) {
        /*
         * NOTE(review): cur[1] is read without a bounds check; presumably
         * the input buffer is zero-terminated - confirm.
         */
        if (ctxt->input->cur[1] == 0xA)
            ctxt->input->cur++;
        return(0xA);
    }
    return(*ctxt->input->cur);

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    if (ctxt->input->end - ctxt->input->cur >= 4) {
        char buffer[150];

        /* Enough bytes are left to show the offending ones. */
        snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                 ctxt->input->cur[0], ctxt->input->cur[1],
                 ctxt->input->cur[2], ctxt->input->cur[3]);
        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
                         "Input is not proper UTF-8, indicate encoding !\n%s",
                         BAD_CAST buffer, NULL);
    } else {
        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
                         "Input is not proper UTF-8, indicate encoding !\n",
                         NULL, NULL);
    }

    /* Only fall back to 8-bit input if no encoding was set so far. */
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING | XML_INPUT_8_BIT;

    *len = 1;
    return(*ctxt->input->cur);

incomplete_sequence:
    /*
     * An encoding problem may arise from a truncated input buffer
     * splitting a character in the middle. In that case do not raise
     * an error but return 0. This should only happen when push parsing
     * char data.
     */
    *len = 0;
    return(0);
}
/**
* xmlStringCurrentChar :
* @ ctxt : the XML parser context
* @ cur : pointer to the beginning of the char
* @ len : pointer to the length of the char read
*
2023-03-13 21:38:41 +03:00
* DEPRECATED : Internal function , do not use .
*
2001-12-31 19:16:02 +03:00
* The current char value , if using UTF - 8 this may actually span multiple
2001-02-23 20:55:21 +03:00
* bytes in the input buffer .
*
2001-10-10 13:45:09 +04:00
* Returns the current char value and its length
2001-02-23 20:55:21 +03:00
*/
int
2002-01-13 18:43:22 +03:00
xmlStringCurrentChar ( xmlParserCtxtPtr ctxt , const xmlChar * cur , int * len )
{
2004-11-08 17:02:18 +03:00
if ( ( len = = NULL ) | | ( cur = = NULL ) ) return ( 0 ) ;
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( ctxt = = NULL ) | | ( ctxt - > input = = NULL ) | |
( ( ctxt - > input - > flags & XML_INPUT_8_BIT ) = = 0 ) ) {
2002-01-13 18:43:22 +03:00
/*
* We are supposed to handle UTF8 , check it ' s valid
* From rfc2044 : encoding of the Unicode values on UTF - 8 :
*
* UCS - 4 range ( hex . ) UTF - 8 octet sequence ( binary )
* 0000 0000 - 0000 007F 0 xxxxxxx
* 0000 00 80 - 0000 07FF 110 xxxxx 10 xxxxxx
2012-09-11 09:26:36 +04:00
* 0000 0800 - 0000 FFFF 1110 xxxx 10 xxxxxx 10 xxxxxx
2002-01-13 18:43:22 +03:00
*
* Check for the 0x110000 limit too
*/
unsigned char c ;
unsigned int val ;
c = * cur ;
if ( c & 0x80 ) {
if ( ( cur [ 1 ] & 0xc0 ) ! = 0x80 )
goto encoding_error ;
if ( ( c & 0xe0 ) = = 0xe0 ) {
if ( ( cur [ 2 ] & 0xc0 ) ! = 0x80 )
goto encoding_error ;
if ( ( c & 0xf0 ) = = 0xf0 ) {
if ( ( ( c & 0xf8 ) ! = 0xf0 ) | | ( ( cur [ 3 ] & 0xc0 ) ! = 0x80 ) )
goto encoding_error ;
/* 4-byte code */
* len = 4 ;
val = ( cur [ 0 ] & 0x7 ) < < 18 ;
val | = ( cur [ 1 ] & 0x3f ) < < 12 ;
val | = ( cur [ 2 ] & 0x3f ) < < 6 ;
val | = cur [ 3 ] & 0x3f ;
} else {
/* 3-byte code */
* len = 3 ;
val = ( cur [ 0 ] & 0xf ) < < 12 ;
val | = ( cur [ 1 ] & 0x3f ) < < 6 ;
val | = cur [ 2 ] & 0x3f ;
}
} else {
/* 2-byte code */
* len = 2 ;
val = ( cur [ 0 ] & 0x1f ) < < 6 ;
val | = cur [ 1 ] & 0x3f ;
}
if ( ! IS_CHAR ( val ) ) {
2003-10-06 01:33:18 +04:00
xmlErrEncodingInt ( ctxt , XML_ERR_INVALID_CHAR ,
" Char 0x%X out of allowed range \n " , val ) ;
2002-01-13 18:43:22 +03:00
}
return ( val ) ;
} else {
/* 1-byte code */
* len = 1 ;
2022-09-01 02:18:30 +03:00
return ( * cur ) ;
2002-01-13 18:43:22 +03:00
}
2001-02-23 20:55:21 +03:00
}
/*
2001-10-10 13:45:09 +04:00
* Assume it ' s a fixed length encoding ( 1 ) with
2001-12-31 19:16:02 +03:00
* a compatible encoding for the ASCII set , since
2001-02-23 20:55:21 +03:00
* XML constructs only use < 128 chars
*/
* len = 1 ;
2022-09-01 02:18:30 +03:00
return ( * cur ) ;
2001-02-23 20:55:21 +03:00
encoding_error :
2017-08-30 15:16:01 +03:00
/*
* An encoding problem may arise from a truncated input buffer
* splitting a character in the middle . In that case do not raise
2019-09-30 18:04:54 +03:00
* an error but return 0 to indicate an end of stream problem
2017-08-30 15:16:01 +03:00
*/
if ( ( ctxt = = NULL ) | | ( ctxt - > input = = NULL ) | |
( ctxt - > input - > end - ctxt - > input - > cur < 4 ) ) {
* len = 0 ;
return ( 0 ) ;
}
2001-02-23 20:55:21 +03:00
/*
* If we detect an UTF8 error that probably mean that the
2001-12-31 19:16:02 +03:00
* input encoding didn ' t get properly advertised in the
2001-02-23 20:55:21 +03:00
* declaration header . Report the error and switch the encoding
* to ISO - Latin - 1 ( if you don ' t like this policy , just declare the
* encoding ! )
*/
2004-11-09 17:59:59 +03:00
{
char buffer [ 150 ] ;
2017-08-30 15:16:01 +03:00
snprintf ( buffer , 149 , " Bytes: 0x%02X 0x%02X 0x%02X 0x%02X \n " ,
ctxt - > input - > cur [ 0 ] , ctxt - > input - > cur [ 1 ] ,
ctxt - > input - > cur [ 2 ] , ctxt - > input - > cur [ 3 ] ) ;
2004-11-09 17:59:59 +03:00
__xmlErrEncoding ( ctxt , XML_ERR_INVALID_CHAR ,
" Input is not proper UTF-8, indicate encoding ! \n %s " ,
BAD_CAST buffer , NULL ) ;
2001-02-23 20:55:21 +03:00
}
* len = 1 ;
2022-09-01 02:18:30 +03:00
return ( * cur ) ;
2001-02-23 20:55:21 +03:00
}
/**
2001-03-24 20:00:36 +03:00
* xmlCopyCharMultiByte :
2001-12-31 19:16:02 +03:00
* @ out : pointer to an array of xmlChar
2001-02-23 20:55:21 +03:00
* @ val : the char value
*
2012-09-11 09:26:36 +04:00
* append the char value in the array
2001-02-23 20:55:21 +03:00
*
* Returns the number of xmlChar written
*/
int
2001-03-24 20:00:36 +03:00
xmlCopyCharMultiByte ( xmlChar * out , int val ) {
2022-09-01 03:58:00 +03:00
if ( ( out = = NULL ) | | ( val < 0 ) ) return ( 0 ) ;
2001-02-23 20:55:21 +03:00
/*
* We are supposed to handle UTF8 , check it ' s valid
* From rfc2044 : encoding of the Unicode values on UTF - 8 :
*
* UCS - 4 range ( hex . ) UTF - 8 octet sequence ( binary )
* 0000 0000 - 0000 007F 0 xxxxxxx
* 0000 00 80 - 0000 07FF 110 xxxxx 10 xxxxxx
2012-09-11 09:26:36 +04:00
* 0000 0800 - 0000 FFFF 1110 xxxx 10 xxxxxx 10 xxxxxx
2001-02-23 20:55:21 +03:00
*/
2001-03-24 20:00:36 +03:00
if ( val > = 0x80 ) {
xmlChar * savedout = out ;
int bits ;
if ( val < 0x800 ) { * out + + = ( val > > 6 ) | 0xC0 ; bits = 0 ; }
else if ( val < 0x10000 ) { * out + + = ( val > > 12 ) | 0xE0 ; bits = 6 ; }
else if ( val < 0x110000 ) { * out + + = ( val > > 18 ) | 0xF0 ; bits = 12 ; }
else {
2003-10-06 01:33:18 +04:00
xmlErrEncodingInt ( NULL , XML_ERR_INVALID_CHAR ,
2001-12-31 19:16:02 +03:00
" Internal error, xmlCopyCharMultiByte 0x%X out of bound \n " ,
2003-10-06 01:33:18 +04:00
val ) ;
2001-02-23 20:55:21 +03:00
return ( 0 ) ;
}
2001-03-24 20:00:36 +03:00
for ( ; bits > = 0 ; bits - = 6 )
* out + + = ( ( val > > bits ) & 0x3F ) | 0x80 ;
return ( out - savedout ) ;
2001-02-23 20:55:21 +03:00
}
2022-09-01 03:58:00 +03:00
* out = val ;
2001-03-24 20:00:36 +03:00
return 1 ;
}
2001-02-23 20:55:21 +03:00
2001-03-24 20:00:36 +03:00
/**
* xmlCopyChar :
* @ len : Ignored , compatibility
2001-12-31 19:16:02 +03:00
* @ out : pointer to an array of xmlChar
2001-03-24 20:00:36 +03:00
* @ val : the char value
*
2012-09-11 09:26:36 +04:00
* append the char value in the array
2001-03-24 20:00:36 +03:00
*
* Returns the number of xmlChar written
*/
2001-02-23 20:55:21 +03:00
2001-03-24 20:00:36 +03:00
int
2001-03-26 20:28:29 +04:00
xmlCopyChar ( int len ATTRIBUTE_UNUSED , xmlChar * out , int val ) {
2022-09-01 03:58:00 +03:00
if ( ( out = = NULL ) | | ( val < 0 ) ) return ( 0 ) ;
2001-03-24 20:00:36 +03:00
/* the len parameter is ignored */
if ( val > = 0x80 ) {
return ( xmlCopyCharMultiByte ( out , val ) ) ;
2001-02-23 20:55:21 +03:00
}
2022-09-01 03:58:00 +03:00
* out = val ;
2001-03-24 20:00:36 +03:00
return 1 ;
2001-02-23 20:55:21 +03:00
}
/************************************************************************
* *
* Commodity functions to switch encodings *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2023-03-21 21:07:12 +03:00
/*
 * xmlDetectEBCDIC:
 * @input:  the parser input whose leading bytes are inspected
 *
 * Pick a conversion handler for EBCDIC input. The start of the input is
 * converted with the generic EBCDIC handler and scanned, up to the first
 * '>', for an encoding declaration. If a well-formed quoted encoding name
 * is found, the generic handler is closed and the result of
 * xmlFindCharEncodingHandler for that name is returned instead.
 *
 * Returns NULL if no generic EBCDIC handler is available, the generic
 *         handler if the conversion fails or no declaration is found,
 *         and otherwise the handler looked up for the declared name.
 */
static xmlCharEncodingHandlerPtr
xmlDetectEBCDIC(xmlParserInputPtr input) {
    xmlChar out[200];
    xmlCharEncodingHandlerPtr handler;
    int inlen, outlen, res, pos;

    /*
     * To detect the EBCDIC code page, we convert the first 200 bytes
     * to EBCDIC-US and try to find the encoding declaration.
     */
    handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC);
    if (handler == NULL)
        return(NULL);

    /* Keep one slot free for the terminating 0 added below. */
    outlen = sizeof(out) - 1;
    inlen = input->end - input->cur;
    res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
    if (res < 0)
        return(handler);
    out[outlen] = 0;

    for (pos = 0; pos < outlen; pos++) {
        int nameStart, ch, delim;

        /* Give up at the first '>'. */
        if (out[pos] == '>')
            break;

        if (out[pos] != 'e')
            continue;
        if (xmlStrncmp(out + pos, BAD_CAST "encoding", 8) != 0)
            continue;

        /*
         * Expect: S? '=' S? quote name quote. The scan cannot run past
         * the buffer since the terminating 0 matches none of the tests.
         */
        pos += 8;
        while (IS_BLANK_CH(out[pos]))
            pos += 1;
        if (out[pos++] != '=')
            break;
        while (IS_BLANK_CH(out[pos]))
            pos += 1;
        delim = out[pos++];
        if ((delim != '\'') && (delim != '"'))
            break;

        nameStart = pos;
        ch = out[pos];
        for (;;) {
            int isNameChar =
                (((ch >= 'a') && (ch <= 'z')) ||
                 ((ch >= 'A') && (ch <= 'Z')) ||
                 ((ch >= '0') && (ch <= '9')) ||
                 (ch == '.') || (ch == '_') ||
                 (ch == '-'));

            if (!isNameChar)
                break;
            ch = out[++pos];
        }
        /* The name must be closed by the quote that opened it. */
        if (ch != delim)
            break;

        /* Cut the name out of the buffer and swap the handlers. */
        out[pos] = 0;
        xmlCharEncCloseFunc(handler);
        handler = xmlFindCharEncodingHandler((char *) out + nameStart);
        break;
    }

    return(handler);
}
2001-02-23 20:55:21 +03:00
/**
* xmlSwitchEncoding :
* @ ctxt : the parser context
* @ enc : the encoding value ( number )
*
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
* Use encoding specified by enum to decode input data .
*
* This function can be used to enforce the encoding of chunks passed
* to xmlParseChunk .
2001-02-23 20:55:21 +03:00
*
* Returns 0 in case of success , - 1 otherwise
*/
int
xmlSwitchEncoding ( xmlParserCtxtPtr ctxt , xmlCharEncoding enc )
{
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
xmlCharEncodingHandlerPtr handler = NULL ;
int check = 1 ;
2015-11-09 13:07:18 +03:00
int ret ;
2001-02-23 20:55:21 +03:00
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( ctxt = = NULL ) | | ( ctxt - > input = = NULL ) )
return ( - 1 ) ;
2023-06-22 19:06:53 +03:00
2001-02-23 20:55:21 +03:00
switch ( enc ) {
case XML_CHAR_ENCODING_NONE :
case XML_CHAR_ENCODING_UTF8 :
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
case XML_CHAR_ENCODING_ASCII :
check = 0 ;
break ;
2023-03-21 21:07:12 +03:00
case XML_CHAR_ENCODING_EBCDIC :
handler = xmlDetectEBCDIC ( ctxt - > input ) ;
break ;
default :
handler = xmlGetCharEncodingHandler ( enc ) ;
break ;
2001-02-23 20:55:21 +03:00
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( check ) & & ( handler = = NULL ) ) {
const char * name = xmlGetCharEncodingName ( enc ) ;
__xmlErrEncoding ( ctxt , XML_ERR_UNSUPPORTED_ENCODING ,
" encoding not supported: %s \n " ,
BAD_CAST ( name ? name : " <null> " ) , NULL ) ;
2015-11-09 13:07:18 +03:00
/*
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
* TODO : We could recover from errors in external entities
* if we didn ' t stop the parser . But most callers of this
* function don ' t check the return value .
*/
2015-11-09 13:07:18 +03:00
xmlStopParser ( ctxt ) ;
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
return ( - 1 ) ;
2015-11-09 13:07:18 +03:00
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
ret = xmlSwitchInputEncoding ( ctxt , ctxt - > input , handler ) ;
if ( ( ret > = 0 ) & & ( enc = = XML_CHAR_ENCODING_NONE ) ) {
ctxt - > input - > flags & = ~ XML_INPUT_HAS_ENCODING ;
}
2015-11-09 13:07:18 +03:00
return ( ret ) ;
2001-02-23 20:55:21 +03:00
}
/**
2023-03-21 21:07:12 +03:00
* xmlSwitchInputEncoding :
2001-02-23 20:55:21 +03:00
* @ ctxt : the parser context
2003-10-19 17:35:37 +04:00
* @ input : the input stream
2001-02-23 20:55:21 +03:00
* @ handler : the encoding handler
*
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
* DEPRECATED : Internal function , don ' t use .
*
* Use encoding handler to decode input data .
2001-02-23 20:55:21 +03:00
*
* Returns 0 in case of success , - 1 otherwise
*/
2023-03-21 21:07:12 +03:00
int
xmlSwitchInputEncoding ( xmlParserCtxtPtr ctxt , xmlParserInputPtr input ,
xmlCharEncodingHandlerPtr handler )
2001-02-23 20:55:21 +03:00
{
int nbchars ;
2022-11-13 21:44:00 +03:00
xmlParserInputBufferPtr in ;
2001-02-23 20:55:21 +03:00
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( ( input = = NULL ) | | ( input - > buf = = NULL ) ) {
2022-11-13 21:44:00 +03:00
xmlCharEncCloseFunc ( handler ) ;
return ( - 1 ) ;
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
in = input - > buf ;
2001-02-23 20:55:21 +03:00
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
input - > flags | = XML_INPUT_HAS_ENCODING ;
input - > flags & = ~ XML_INPUT_8_BIT ;
if ( in - > encoder = = handler )
return ( 0 ) ;
2023-04-13 16:11:47 +03:00
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
if ( in - > encoder ! = NULL ) {
2022-11-13 21:44:00 +03:00
/*
2023-04-13 16:11:47 +03:00
* Switching encodings during parsing is a really bad idea ,
2023-06-22 19:06:53 +03:00
* but Chromium can switch between ISO - 8859 - 1 and UTF - 16 before
* separate calls to xmlParseChunk .
2023-04-13 16:11:47 +03:00
*
* TODO : We should check whether the " raw " input buffer is empty and
* convert the old content using the old encoder .
2003-10-19 17:35:37 +04:00
*/
2023-04-13 16:11:47 +03:00
xmlCharEncCloseFunc ( in - > encoder ) ;
in - > encoder = handler ;
2022-11-13 21:44:00 +03:00
return ( 0 ) ;
}
2023-03-21 21:07:12 +03:00
2022-11-13 21:44:00 +03:00
in - > encoder = handler ;
2003-10-19 17:35:37 +04:00
2022-11-13 21:44:00 +03:00
/*
* Is there already some content down the pipe to convert ?
*/
if ( xmlBufIsEmpty ( in - > buffer ) = = 0 ) {
2023-08-08 16:19:51 +03:00
size_t processed ;
2022-11-13 21:44:00 +03:00
/*
* Shrink the current input buffer .
* Move it as the raw buffer and create a new input buffer
*/
processed = input - > cur - input - > base ;
xmlBufShrink ( in - > buffer , processed ) ;
2022-11-20 21:55:12 +03:00
input - > consumed + = processed ;
2022-11-13 21:44:00 +03:00
in - > raw = in - > buffer ;
in - > buffer = xmlBufCreate ( ) ;
in - > rawconsumed = processed ;
2023-04-19 22:55:24 +03:00
/*
* TODO : We must flush and decode the whole buffer to make functions
* like xmlReadMemory work with a user - provided encoding . If the
* encoding is specified directly , we should probably set
* XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
* twice . Then we could set " flush " to false which should save
* a considerable amount of memory when parsing from memory .
* It ' s probably even possible to remove this whole if - block
* completely .
*/
nbchars = xmlCharEncInput ( in , 1 ) ;
2022-11-13 21:44:00 +03:00
xmlBufResetInput ( in - > buffer , input ) ;
if ( nbchars < 0 ) {
2023-03-30 14:53:24 +03:00
/* TODO: This could be an out of memory or an encoding error. */
2022-11-13 21:44:00 +03:00
xmlErrInternal ( ctxt ,
" switching encoding: encoder error \n " ,
NULL ) ;
2023-03-30 14:53:24 +03:00
xmlHaltParser ( ctxt ) ;
2022-11-13 21:44:00 +03:00
return ( - 1 ) ;
}
2003-10-19 17:35:37 +04:00
}
2022-11-13 21:44:00 +03:00
return ( 0 ) ;
2003-10-19 17:35:37 +04:00
}
2009-08-26 13:38:49 +04:00
/**
* xmlSwitchToEncoding :
* @ ctxt : the parser context
* @ handler : the encoding handler
*
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
* Use encoding handler to decode input data .
*
* This function can be used to enforce the encoding of chunks passed
* to xmlParseChunk .
2009-08-26 13:38:49 +04:00
*
* Returns 0 in case of success , - 1 otherwise
*/
int
2012-09-11 09:26:36 +04:00
xmlSwitchToEncoding ( xmlParserCtxtPtr ctxt , xmlCharEncodingHandlerPtr handler )
2009-08-26 13:38:49 +04:00
{
2022-03-07 01:23:43 +03:00
if ( ctxt = = NULL )
return ( - 1 ) ;
2023-03-21 21:07:12 +03:00
return ( xmlSwitchInputEncoding ( ctxt , ctxt - > input , handler ) ) ;
2009-08-26 13:38:49 +04:00
}
parser: Rework encoding detection
Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.
Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.
Introduce private helper functions to switch encodings used by both the
XML and HTML parser:
- xmlDetectEncoding which skips over the BOM, allowing to remove the
BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
about encoding mismatches.
If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.
Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)
The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.
The 'standalone' member of xmlParserInput is renamed to 'flags'.
A new parser state XML_PARSER_XML_DECL is added for the push parser.
2023-08-08 16:19:46 +03:00
/**
 * xmlDetectEncoding:
 * @ctxt:  the parser context
 *
 * Handle optional BOM, detect and switch to encoding.
 *
 * Needs at least four bytes in the input buffer; if fewer are available
 * after trying to grow the input, nothing is done.
 *
 * If an encoding was already set on the input (XML_INPUT_HAS_ENCODING),
 * only a leading UTF-8 encoded BOM is skipped.
 */
void
xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
    const xmlChar *in;
    xmlCharEncoding enc;
    int bomSize;
    int autoFlag = 0;

    if (xmlParserGrow(ctxt) < 0)
        return;
    /*
     * Fetch the cursor only after growing the input: growing may refill
     * or relocate the buffer, so a pointer taken earlier could be stale.
     */
    in = ctxt->input->cur;
    if (ctxt->input->end - in < 4)
        return;

    if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) {
        /*
         * If the encoding was already set, only skip the BOM which was
         * possibly decoded to UTF-8.
         */
        if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) {
            ctxt->input->cur += 3;
        }

        return;
    }

    enc = XML_CHAR_ENCODING_NONE;
    bomSize = 0;

    switch (in[0]) {
        case 0x00:
            /* 00 00 00 3C: '<' in UCS-4 big endian */
            if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
                enc = XML_CHAR_ENCODING_UCS4BE;
                autoFlag = XML_INPUT_AUTO_OTHER;
            /* 00 3C 00 3F: "<?" in UTF-16 big endian */
            } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
                enc = XML_CHAR_ENCODING_UTF16BE;
                autoFlag = XML_INPUT_AUTO_UTF16BE;
            }
            break;

        case 0x3C:
            if (in[1] == 0x00) {
                /* 3C 00 00 00: '<' in UCS-4 little endian */
                if ((in[2] == 0x00) && (in[3] == 0x00)) {
                    enc = XML_CHAR_ENCODING_UCS4LE;
                    autoFlag = XML_INPUT_AUTO_OTHER;
                /* 3C 00 3F 00: "<?" in UTF-16 little endian */
                } else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
                    enc = XML_CHAR_ENCODING_UTF16LE;
                    autoFlag = XML_INPUT_AUTO_UTF16LE;
                }
            }
            break;

        case 0x4C:
            /* 4C 6F A7 94: "<?xm" in EBCDIC */
            if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
                enc = XML_CHAR_ENCODING_EBCDIC;
                autoFlag = XML_INPUT_AUTO_OTHER;
            }
            break;

        case 0xEF:
            /* EF BB BF: UTF-8 BOM */
            if ((in[1] == 0xBB) && (in[2] == 0xBF)) {
                enc = XML_CHAR_ENCODING_UTF8;
                autoFlag = XML_INPUT_AUTO_UTF8;
                bomSize = 3;
            }
            break;

        case 0xFE:
            /* FE FF: UTF-16 big endian BOM */
            if (in[1] == 0xFF) {
                enc = XML_CHAR_ENCODING_UTF16BE;
                autoFlag = XML_INPUT_AUTO_UTF16BE;
                bomSize = 2;
            }
            break;

        case 0xFF:
            /* FF FE: UTF-16 little endian BOM */
            if (in[1] == 0xFE) {
                enc = XML_CHAR_ENCODING_UTF16LE;
                autoFlag = XML_INPUT_AUTO_UTF16LE;
                bomSize = 2;
            }
            break;

        default:
            break;
    }

    /* Skip the raw BOM bytes before the decoder is installed. */
    if (bomSize > 0) {
        ctxt->input->cur += bomSize;
    }

    if (enc != XML_CHAR_ENCODING_NONE) {
        ctxt->input->flags |= autoFlag;
        xmlSwitchEncoding(ctxt, enc);
    }
}
/**
 * xmlSetDeclaredEncoding:
 * @ctxt:  the parser context
 * @encoding:  declared encoding
 *
 * Set the encoding from a declaration in the document.
 *
 * If no encoding was set yet, switch the encoding. Otherwise, only warn
 * about encoding mismatches.
 *
 * Takes ownership of 'encoding': it is stored in ctxt->encoding and any
 * previously stored value is freed.
 */
void
xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
    static const char *utf8Names[] = {
        "UTF-8", "UTF8", NULL
    };
    static const char *utf16leNames[] = {
        "UTF-16", "UTF-16LE", "UTF16", NULL
    };
    static const char *utf16beNames[] = {
        "UTF-16", "UTF-16BE", "UTF16", NULL
    };
    const char **names = NULL;
    const char *detected = NULL;
    int autoBits;
    int idx;

    if (ctxt->encoding != NULL)
        xmlFree((xmlChar *) ctxt->encoding);
    ctxt->encoding = encoding;

    /* No encoding chosen so far and declarations are not ignored: switch. */
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
        ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
        xmlCharEncodingHandlerPtr handler;

        handler = xmlFindCharEncodingHandler((const char *) encoding);
        if (handler == NULL) {
            __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                             "Unsupported encoding: %s\n",
                             encoding, NULL);
        } else {
            xmlSwitchToEncoding(ctxt, handler);
        }
        return;
    }

    /* Only auto-detected encodings are checked against the declaration. */
    autoBits = ctxt->input->flags & XML_INPUT_AUTO_ENCODING;
    if (autoBits == 0)
        return;

    switch (autoBits) {
        case XML_INPUT_AUTO_UTF8:
            names = utf8Names;
            detected = "UTF-8";
            break;
        case XML_INPUT_AUTO_UTF16LE:
            names = utf16leNames;
            detected = "UTF-16LE";
            break;
        case XML_INPUT_AUTO_UTF16BE:
            names = utf16beNames;
            detected = "UTF-16BE";
            break;
    }

    /* Other auto-detected encodings have no list of accepted names. */
    if (names == NULL)
        return;

    /* Case-insensitive search for the declared name among accepted ones. */
    idx = 0;
    while (names[idx] != NULL) {
        if (xmlStrcasecmp(encoding, BAD_CAST names[idx]) == 0)
            return;
        idx++;
    }

    xmlWarningMsg(ctxt, XML_WAR_ENCODING_MISMATCH,
                  "Encoding '%s' doesn't match "
                  "auto-detected '%s'\n",
                  encoding, BAD_CAST detected);
}
2001-02-23 20:55:21 +03:00
/************************************************************************
 *                                                                      *
 *          Commodity functions to handle entities processing           *
 *                                                                      *
 ************************************************************************/
/**
 * xmlFreeInputStream:
 * @input:  an xmlParserInputPtr
 *
 * Free up an input stream: its filename, directory and version strings,
 * the base content (through the stream's own free callback, if one is
 * set), the attached parser input buffer, and the structure itself.
 * A NULL @input is ignored.
 */
void
xmlFreeInputStream(xmlParserInputPtr input) {
    if (input == NULL)
        return;

    if (input->filename != NULL) {
        xmlFree((char *) input->filename);
    }
    if (input->directory != NULL) {
        xmlFree((char *) input->directory);
    }
    if (input->version != NULL) {
        xmlFree((char *) input->version);
    }

    /* The base content is only released when a free callback is set. */
    if ((input->free != NULL) && (input->base != NULL)) {
        input->free((xmlChar *) input->base);
    }

    if (input->buf != NULL) {
        xmlFreeParserInputBuffer(input->buf);
    }

    xmlFree(input);
}
/**
 * xmlNewInputStream:
 * @ctxt:  an XML parser context
 *
 * Create a new input stream structure.
 *
 * The structure is zero-initialized with line and column set to 1.
 * When @ctxt is not NULL, the stream gets the next input id from the
 * context.
 *
 * Returns the new input stream or NULL (allocation failure, or the
 * context's input id counter is exhausted)
 */
xmlParserInputPtr
xmlNewInputStream(xmlParserCtxtPtr ctxt) {
    xmlParserInputPtr input;

    /*
     * Check the context's id counter (not the id of the freshly zeroed
     * stream) before allocating, so that the increment below can never
     * overflow and nothing has to be released on this error path.
     */
    if ((ctxt != NULL) && (ctxt->input_id >= INT_MAX)) {
        xmlErrMemory(ctxt, "Input ID overflow\n");
        return(NULL);
    }

    input = (xmlParserInputPtr) xmlMalloc(sizeof(xmlParserInput));
    if (input == NULL) {
        xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(input, 0, sizeof(xmlParserInput));
    input->line = 1;
    input->col = 1;

    /*
     * If the context is NULL the id cannot be initialized, but that
     * should not happen while parsing which is the situation where
     * the id is actually needed.
     */
    if (ctxt != NULL)
        input->id = ctxt->input_id++;

    return(input);
}
/**
 * xmlNewIOInputStream:
 * @ctxt:  an XML parser context
 * @input:  an I/O Input
 * @enc:  the charset encoding if known
 *
 * Create a new input stream structure encapsulating the @input into
 * a stream suitable for the parser. The @input buffer is stored in the
 * new stream, not copied.
 *
 * Returns the new input stream or NULL
 */
xmlParserInputPtr
xmlNewIOInputStream(xmlParserCtxtPtr ctxt, xmlParserInputBufferPtr input,
                    xmlCharEncoding enc) {
    xmlParserInputPtr stream;

    if (input == NULL)
        return(NULL);

    if (xmlParserDebugEntities)
        xmlGenericError(xmlGenericErrorContext, "new input from I/O\n");

    stream = xmlNewInputStream(ctxt);
    if (stream != NULL) {
        stream->filename = NULL;
        stream->buf = input;
        xmlBufResetInput(stream->buf->buffer, stream);

        /*
         * NOTE(review): xmlSwitchEncoding is given only the context, so
         * it presumably acts on the context's current input rather than
         * on the stream created here - confirm against callers.
         */
        if (enc != XML_CHAR_ENCODING_NONE)
            xmlSwitchEncoding(ctxt, enc);
    }

    return(stream);
}
/**
 * xmlNewEntityInputStream:
 * @ctxt:  an XML parser context
 * @entity:  an Entity pointer
 *
 * DEPRECATED: Internal function, do not use.
 *
 * Create a new input stream based on an xmlEntityPtr.
 *
 * An entity with content is read directly from that content. An
 * external parsed or parameter entity without content is loaded through
 * xmlLoadExternalEntity. Any other entity without content is an error.
 *
 * Returns the new input stream or NULL
 */
xmlParserInputPtr
xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
    xmlParserInputPtr stream;

    if (entity == NULL) {
        xmlErrInternal(ctxt, "xmlNewEntityInputStream entity = NULL\n",
                       NULL);
        return(NULL);
    }

    if (xmlParserDebugEntities)
        xmlGenericError(xmlGenericErrorContext,
                "new input from entity: %s\n", entity->name);

    if (entity->content == NULL) {
        /* Message to report for entity kinds that cannot be loaded. */
        const char *complaint = NULL;

        switch (entity->etype) {
            case XML_EXTERNAL_GENERAL_PARSED_ENTITY:
            case XML_EXTERNAL_PARAMETER_ENTITY:
                stream = xmlLoadExternalEntity((char *) entity->URI,
                                               (char *) entity->ExternalID,
                                               ctxt);
                if (stream != NULL)
                    stream->entity = entity;
                return(stream);
            case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY:
                complaint = "Cannot parse entity %s\n";
                break;
            case XML_INTERNAL_GENERAL_ENTITY:
                complaint = "Internal entity %s without content !\n";
                break;
            case XML_INTERNAL_PARAMETER_ENTITY:
                complaint =
                    "Internal parameter entity %s without content !\n";
                break;
            case XML_INTERNAL_PREDEFINED_ENTITY:
                complaint = "Predefined entity %s without content !\n";
                break;
            default:
                break;
        }

        if (complaint != NULL)
            xmlErrInternal(ctxt, complaint, entity->name);
        return(NULL);
    }

    stream = xmlNewInputStream(ctxt);
    if (stream == NULL)
        return(NULL);

    if (entity->URI != NULL)
        stream->filename = (char *) xmlStrdup((xmlChar *) entity->URI);

    /* A zero length is treated as "not computed yet". */
    if (entity->length == 0)
        entity->length = xmlStrlen(entity->content);

    /* The stream reads the entity content in place. */
    stream->base = entity->content;
    stream->cur = entity->content;
    stream->length = entity->length;
    stream->end = &entity->content[stream->length];
    stream->entity = entity;

    return(stream);
}
/**
 * xmlNewStringInputStream:
 * @ctxt:  an XML parser context
 * @buffer:  a memory buffer
 *
 * Create a new input stream based on a memory buffer.
 *
 * Returns the new input stream, or NULL if @buffer is NULL or an
 * allocation fails
 */
xmlParserInputPtr
xmlNewStringInputStream(xmlParserCtxtPtr ctxt, const xmlChar *buffer) {
    xmlParserInputBufferPtr ibuf;
    xmlParserInputPtr stream;

    if (buffer == NULL) {
        xmlErrInternal(ctxt, "xmlNewStringInputStream string = NULL\n",
                       NULL);
        return(NULL);
    }

    if (xmlParserDebugEntities)
        xmlGenericError(xmlGenericErrorContext,
                "new fixed input: %.30s\n", buffer);

    ibuf = xmlParserInputBufferCreateString(buffer);
    if (ibuf == NULL) {
        xmlErrMemory(ctxt, NULL);
        return(NULL);
    }

    stream = xmlNewInputStream(ctxt);
    if (stream != NULL) {
        stream->buf = ibuf;
        xmlBufResetInput(stream->buf->buffer, stream);
        return(stream);
    }

    /* No stream to own the buffer: report the failure and release it. */
    xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
    xmlFreeParserInputBuffer(ibuf);
    return(NULL);
}
/**
 * xmlNewInputFromFile:
 * @ctxt:  an XML parser context
 * @filename:  the filename to use as entity
 *
 * Create a new input stream based on a file or a URL.
 *
 * The stream's filename is replaced by the canonic path of the name it
 * ended up with (its own filename if set after xmlCheckHTTPInput(),
 * @filename otherwise), and its directory is derived from that same name.
 * If @ctxt has no directory yet, it receives a copy of the stream's one.
 *
 * Returns the new input stream or NULL in case of error
 */
xmlParserInputPtr
xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
    xmlParserInputBufferPtr ioBuf;
    xmlParserInputPtr stream;
    xmlChar *uri = NULL;
    char *dir = NULL;

    if (xmlParserDebugEntities) {
        xmlGenericError(xmlGenericErrorContext,
                        " new input from file: %s \n ", filename);
    }
    if (ctxt == NULL)
        return(NULL);

    ioBuf = xmlParserInputBufferCreateFilename(filename,
                                               XML_CHAR_ENCODING_NONE);
    if (ioBuf == NULL) {
        if (filename != NULL) {
            __xmlLoaderErr(ctxt,
                           " failed to load external entity \" %s \" \n ",
                           (const char *) filename);
        } else {
            __xmlLoaderErr(ctxt,
                           " failed to load external entity: NULL filename \n ",
                           NULL);
        }
        return(NULL);
    }

    stream = xmlNewInputStream(ctxt);
    if (stream == NULL) {
        xmlFreeParserInputBuffer(ioBuf);
        return(NULL);
    }
    stream->buf = ioBuf;

    /*
     * NOTE(review): a NULL result is returned as-is with no cleanup here;
     * presumably xmlCheckHTTPInput() disposes of the stream itself - confirm.
     */
    stream = xmlCheckHTTPInput(ctxt, stream);
    if (stream == NULL)
        return(NULL);

    /* Prefer the name recorded on the stream over the caller's one. */
    uri = xmlStrdup((xmlChar *) ((stream->filename != NULL) ?
                                 stream->filename : filename));
    dir = xmlParserGetDirectory((const char *) uri);

    if (stream->filename != NULL)
        xmlFree((char *) stream->filename);
    stream->filename = (char *) xmlCanonicPath((const xmlChar *) uri);
    if (uri != NULL)
        xmlFree((char *) uri);
    stream->directory = dir;

    xmlBufResetInput(stream->buf->buffer, stream);

    if ((dir != NULL) && (ctxt->directory == NULL))
        ctxt->directory = (char *) xmlStrdup((const xmlChar *) dir);

    return(stream);
}
/************************************************************************
* *
* Commodity functions to handle parser contexts *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/**
2022-08-24 05:21:58 +03:00
* xmlInitSAXParserCtxt :
* @ ctxt : XML parser context
* @ sax : SAX handlert
* @ userData : user data
2001-02-23 20:55:21 +03:00
*
2022-08-24 05:21:58 +03:00
* Initialize a SAX parser context
2003-04-24 20:06:47 +04:00
*
* Returns 0 in case of success and - 1 in case of error
2001-02-23 20:55:21 +03:00
*/
2022-08-24 05:21:58 +03:00
static int
2022-09-01 01:13:19 +03:00
xmlInitSAXParserCtxt ( xmlParserCtxtPtr ctxt , const xmlSAXHandler * sax ,
2022-08-24 05:21:58 +03:00
void * userData )
2001-02-23 20:55:21 +03:00
{
2004-11-09 17:59:59 +03:00
xmlParserInputPtr input ;
2001-08-31 18:55:30 +04:00
if ( ctxt = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrInternal ( NULL , " Got NULL parser context \n " , NULL ) ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-08-31 18:55:30 +04:00
}
2022-03-29 15:18:31 +03:00
xmlInitParser ( ) ;
2001-02-23 20:55:21 +03:00
2004-11-02 17:52:23 +03:00
if ( ctxt - > dict = = NULL )
ctxt - > dict = xmlDictCreate ( ) ;
2003-08-18 16:15:38 +04:00
if ( ctxt - > dict = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2003-08-18 16:15:38 +04:00
return ( - 1 ) ;
}
2012-07-30 06:08:45 +04:00
xmlDictSetLimit ( ctxt - > dict , XML_MAX_DICTIONARY_LIMIT ) ;
2004-11-02 17:52:23 +03:00
if ( ctxt - > sax = = NULL )
ctxt - > sax = ( xmlSAXHandler * ) xmlMalloc ( sizeof ( xmlSAXHandler ) ) ;
2002-11-22 08:07:29 +03:00
if ( ctxt - > sax = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
2022-08-24 05:21:58 +03:00
if ( sax = = NULL ) {
memset ( ctxt - > sax , 0 , sizeof ( xmlSAXHandler ) ) ;
2003-09-25 18:29:29 +04:00
xmlSAXVersion ( ctxt - > sax , 2 ) ;
2022-08-24 05:21:58 +03:00
ctxt - > userData = ctxt ;
} else {
if ( sax - > initialized = = XML_SAX2_MAGIC ) {
memcpy ( ctxt - > sax , sax , sizeof ( xmlSAXHandler ) ) ;
} else {
memset ( ctxt - > sax , 0 , sizeof ( xmlSAXHandler ) ) ;
memcpy ( ctxt - > sax , sax , sizeof ( xmlSAXHandlerV1 ) ) ;
}
ctxt - > userData = userData ? userData : ctxt ;
}
2001-02-23 20:55:21 +03:00
2003-08-19 19:01:28 +04:00
ctxt - > maxatts = 0 ;
ctxt - > atts = NULL ;
2001-02-23 20:55:21 +03:00
/* Allocate the Input stack */
2004-11-02 17:52:23 +03:00
if ( ctxt - > inputTab = = NULL ) {
ctxt - > inputTab = ( xmlParserInputPtr * )
xmlMalloc ( 5 * sizeof ( xmlParserInputPtr ) ) ;
ctxt - > inputMax = 5 ;
}
2001-02-23 20:55:21 +03:00
if ( ctxt - > inputTab = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2001-02-23 20:55:21 +03:00
ctxt - > inputNr = 0 ;
ctxt - > inputMax = 0 ;
ctxt - > input = NULL ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
2004-11-09 17:59:59 +03:00
while ( ( input = inputPop ( ctxt ) ) ! = NULL ) { /* Non consuming */
xmlFreeInputStream ( input ) ;
}
2001-02-23 20:55:21 +03:00
ctxt - > inputNr = 0 ;
ctxt - > input = NULL ;
ctxt - > version = NULL ;
ctxt - > encoding = NULL ;
ctxt - > standalone = - 1 ;
ctxt - > hasExternalSubset = 0 ;
ctxt - > hasPErefs = 0 ;
ctxt - > html = 0 ;
ctxt - > external = 0 ;
ctxt - > instate = XML_PARSER_START ;
ctxt - > token = 0 ;
ctxt - > directory = NULL ;
/* Allocate the Node stack */
2004-11-02 17:52:23 +03:00
if ( ctxt - > nodeTab = = NULL ) {
ctxt - > nodeTab = ( xmlNodePtr * ) xmlMalloc ( 10 * sizeof ( xmlNodePtr ) ) ;
ctxt - > nodeMax = 10 ;
}
2001-02-23 20:55:21 +03:00
if ( ctxt - > nodeTab = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2001-02-23 20:55:21 +03:00
ctxt - > nodeNr = 0 ;
ctxt - > nodeMax = 0 ;
ctxt - > node = NULL ;
ctxt - > inputNr = 0 ;
ctxt - > inputMax = 0 ;
ctxt - > input = NULL ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
ctxt - > nodeNr = 0 ;
ctxt - > node = NULL ;
/* Allocate the Name stack */
2004-11-02 17:52:23 +03:00
if ( ctxt - > nameTab = = NULL ) {
ctxt - > nameTab = ( const xmlChar * * ) xmlMalloc ( 10 * sizeof ( xmlChar * ) ) ;
ctxt - > nameMax = 10 ;
}
2001-02-23 20:55:21 +03:00
if ( ctxt - > nameTab = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2001-02-23 20:55:21 +03:00
ctxt - > nodeNr = 0 ;
ctxt - > nodeMax = 0 ;
ctxt - > node = NULL ;
ctxt - > inputNr = 0 ;
ctxt - > inputMax = 0 ;
ctxt - > input = NULL ;
ctxt - > nameNr = 0 ;
ctxt - > nameMax = 0 ;
ctxt - > name = NULL ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
ctxt - > nameNr = 0 ;
ctxt - > name = NULL ;
/* Allocate the space stack */
2004-11-02 17:52:23 +03:00
if ( ctxt - > spaceTab = = NULL ) {
ctxt - > spaceTab = ( int * ) xmlMalloc ( 10 * sizeof ( int ) ) ;
ctxt - > spaceMax = 10 ;
}
2001-02-23 20:55:21 +03:00
if ( ctxt - > spaceTab = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot initialize parser context \n " ) ;
2001-02-23 20:55:21 +03:00
ctxt - > nodeNr = 0 ;
ctxt - > nodeMax = 0 ;
ctxt - > node = NULL ;
ctxt - > inputNr = 0 ;
ctxt - > inputMax = 0 ;
ctxt - > input = NULL ;
ctxt - > nameNr = 0 ;
ctxt - > nameMax = 0 ;
ctxt - > name = NULL ;
ctxt - > spaceNr = 0 ;
ctxt - > spaceMax = 0 ;
ctxt - > space = NULL ;
2003-04-24 20:06:47 +04:00
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
}
ctxt - > spaceNr = 1 ;
ctxt - > spaceMax = 10 ;
ctxt - > spaceTab [ 0 ] = - 1 ;
ctxt - > space = & ctxt - > spaceTab [ 0 ] ;
ctxt - > myDoc = NULL ;
ctxt - > wellFormed = 1 ;
2003-09-12 03:42:01 +04:00
ctxt - > nsWellFormed = 1 ;
2001-02-23 20:55:21 +03:00
ctxt - > valid = 1 ;
ctxt - > loadsubset = xmlLoadExtDtdDefaultValue ;
2014-06-11 12:59:16 +04:00
if ( ctxt - > loadsubset ) {
ctxt - > options | = XML_PARSE_DTDLOAD ;
}
2001-02-23 20:55:21 +03:00
ctxt - > validate = xmlDoValidityCheckingDefaultValue ;
ctxt - > pedantic = xmlPedanticParserDefaultValue ;
2014-06-11 12:59:16 +04:00
if ( ctxt - > pedantic ) {
ctxt - > options | = XML_PARSE_PEDANTIC ;
}
2001-07-25 21:18:57 +04:00
ctxt - > linenumbers = xmlLineNumbersDefaultValue ;
2001-02-23 20:55:21 +03:00
ctxt - > keepBlanks = xmlKeepBlanksDefaultValue ;
2014-06-11 12:59:16 +04:00
if ( ctxt - > keepBlanks = = 0 ) {
2003-09-26 18:51:39 +04:00
ctxt - > sax - > ignorableWhitespace = xmlSAX2IgnorableWhitespace ;
2014-06-11 12:59:16 +04:00
ctxt - > options | = XML_PARSE_NOBLANKS ;
}
2001-09-14 14:29:27 +04:00
2022-01-13 19:06:14 +03:00
ctxt - > vctxt . flags = XML_VCTXT_USE_PCTXT ;
2001-02-23 20:55:21 +03:00
ctxt - > vctxt . userData = ctxt ;
2002-02-03 23:13:06 +03:00
ctxt - > vctxt . error = xmlParserValidityError ;
ctxt - > vctxt . warning = xmlParserValidityWarning ;
2001-02-23 20:55:21 +03:00
if ( ctxt - > validate ) {
if ( xmlGetWarningsDefaultValue = = 0 )
ctxt - > vctxt . warning = NULL ;
else
ctxt - > vctxt . warning = xmlParserValidityWarning ;
2001-04-21 18:16:10 +04:00
ctxt - > vctxt . nodeMax = 0 ;
2014-06-11 12:59:16 +04:00
ctxt - > options | = XML_PARSE_DTDVALID ;
2001-02-23 20:55:21 +03:00
}
ctxt - > replaceEntities = xmlSubstituteEntitiesDefaultValue ;
2014-06-11 12:59:16 +04:00
if ( ctxt - > replaceEntities ) {
ctxt - > options | = XML_PARSE_NOENT ;
}
2001-02-23 20:55:21 +03:00
ctxt - > record_info = 0 ;
ctxt - > checkIndex = 0 ;
ctxt - > inSubset = 0 ;
ctxt - > errNo = XML_ERR_OK ;
ctxt - > depth = 0 ;
2001-08-22 18:29:45 +04:00
ctxt - > catalogs = NULL ;
2013-02-19 06:21:49 +04:00
ctxt - > sizeentities = 0 ;
ctxt - > sizeentcopy = 0 ;
2012-05-15 07:18:40 +04:00
ctxt - > input_id = 1 ;
2001-02-23 20:55:21 +03:00
xmlInitNodeInfoSeq ( & ctxt - > node_seq ) ;
2003-04-24 20:06:47 +04:00
return ( 0 ) ;
2001-02-23 20:55:21 +03:00
}
2022-08-24 05:21:58 +03:00
/**
 * xmlInitParserCtxt:
 * @ctxt:  an XML parser context
 *
 * DEPRECATED: Internal function which will be made private in a future
 * version.
 *
 * Initialize a parser context. This forwards to xmlInitSAXParserCtxt()
 * with neither a SAX handler nor user data.
 *
 * Returns 0 in case of success and -1 in case of error
 */
int
xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
{
    int status;

    status = xmlInitSAXParserCtxt(ctxt, NULL, NULL);
    return(status);
}
2001-02-23 20:55:21 +03:00
/**
 * xmlFreeParserCtxt:
 * @ctxt:  an XML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * A NULL @ctxt is accepted and ignored.
 */
void
xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
{
    xmlParserInputPtr pending;

    if (ctxt == NULL)
        return;

    /* Release every input still on the stack. */
    for (pending = inputPop(ctxt); pending != NULL;
         pending = inputPop(ctxt)) {    /* Non consuming */
        xmlFreeInputStream(pending);
    }

    /* Stack tables and document-level strings */
    if (ctxt->spaceTab != NULL)
        xmlFree(ctxt->spaceTab);
    if (ctxt->nameTab != NULL)
        xmlFree((xmlChar * *) ctxt->nameTab);
    if (ctxt->nodeTab != NULL)
        xmlFree(ctxt->nodeTab);
    if (ctxt->nodeInfoTab != NULL)
        xmlFree(ctxt->nodeInfoTab);
    if (ctxt->inputTab != NULL)
        xmlFree(ctxt->inputTab);
    if (ctxt->version != NULL)
        xmlFree((char *) ctxt->version);
    if (ctxt->encoding != NULL)
        xmlFree((char *) ctxt->encoding);
    if (ctxt->extSubURI != NULL)
        xmlFree((char *) ctxt->extSubURI);
    if (ctxt->extSubSystem != NULL)
        xmlFree((char *) ctxt->extSubSystem);

    /* With SAX1 enabled, the default handler is never freed. */
    if (ctxt->sax != NULL) {
#ifdef LIBXML_SAX1_ENABLED
        if (ctxt->sax != (xmlSAXHandlerPtr) &xmlDefaultSAXHandler)
#endif /* LIBXML_SAX1_ENABLED */
            xmlFree(ctxt->sax);
    }

    if (ctxt->directory != NULL)
        xmlFree((char *) ctxt->directory);
    if (ctxt->vctxt.nodeTab != NULL)
        xmlFree(ctxt->vctxt.nodeTab);
    if (ctxt->atts != NULL)
        xmlFree((xmlChar * *) ctxt->atts);
    if (ctxt->dict != NULL)
        xmlDictFree(ctxt->dict);
    if (ctxt->nsTab != NULL)
        xmlFree((char *) ctxt->nsTab);
    if (ctxt->pushTab != NULL)
        xmlFree(ctxt->pushTab);
    if (ctxt->attallocs != NULL)
        xmlFree(ctxt->attallocs);
    if (ctxt->attsDefault != NULL)
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
    if (ctxt->attsSpecial != NULL)
        xmlHashFree(ctxt->attsSpecial, NULL);

    /* Walk the free lists of elements and attributes. */
    if (ctxt->freeElems != NULL) {
        xmlNodePtr elem, following;

        for (elem = ctxt->freeElems; elem != NULL; elem = following) {
            following = elem->next;
            xmlFree(elem);
        }
    }
    if (ctxt->freeAttrs != NULL) {
        xmlAttrPtr attr, following;

        for (attr = ctxt->freeAttrs; attr != NULL; attr = following) {
            following = attr->next;
            xmlFree(attr);
        }
    }

    /*
     * cleanup the error strings
     */
    if (ctxt->lastError.message != NULL)
        xmlFree(ctxt->lastError.message);
    if (ctxt->lastError.file != NULL)
        xmlFree(ctxt->lastError.file);
    if (ctxt->lastError.str1 != NULL)
        xmlFree(ctxt->lastError.str1);
    if (ctxt->lastError.str2 != NULL)
        xmlFree(ctxt->lastError.str2);
    if (ctxt->lastError.str3 != NULL)
        xmlFree(ctxt->lastError.str3);

#ifdef LIBXML_CATALOG_ENABLED
    if (ctxt->catalogs != NULL)
        xmlCatalogFreeLocal(ctxt->catalogs);
#endif

    xmlFree(ctxt);
}
/**
* xmlNewParserCtxt :
*
* Allocate and initialize a new parser context .
*
* Returns the xmlParserCtxtPtr or NULL
*/
xmlParserCtxtPtr
2005-07-30 02:02:24 +04:00
xmlNewParserCtxt ( void )
2022-08-24 05:21:58 +03:00
{
return ( xmlNewSAXParserCtxt ( NULL , NULL ) ) ;
}
/**
* xmlNewSAXParserCtxt :
* @ sax : SAX handler
* @ userData : user data
*
2022-09-01 01:13:19 +03:00
* Allocate and initialize a new SAX parser context . If userData is NULL ,
* the parser context will be passed as user data .
2022-08-24 05:21:58 +03:00
*
2022-09-01 01:13:19 +03:00
* Returns the xmlParserCtxtPtr or NULL if memory allocation failed .
2022-08-24 05:21:58 +03:00
*/
xmlParserCtxtPtr
2022-09-01 01:13:19 +03:00
xmlNewSAXParserCtxt ( const xmlSAXHandler * sax , void * userData )
2001-02-23 20:55:21 +03:00
{
xmlParserCtxtPtr ctxt ;
ctxt = ( xmlParserCtxtPtr ) xmlMalloc ( sizeof ( xmlParserCtxt ) ) ;
if ( ctxt = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( NULL , " cannot allocate parser context \n " ) ;
2001-02-23 20:55:21 +03:00
return ( NULL ) ;
}
memset ( ctxt , 0 , sizeof ( xmlParserCtxt ) ) ;
2022-08-24 05:21:58 +03:00
if ( xmlInitSAXParserCtxt ( ctxt , sax , userData ) < 0 ) {
2003-04-24 20:06:47 +04:00
xmlFreeParserCtxt ( ctxt ) ;
return ( NULL ) ;
}
2001-02-23 20:55:21 +03:00
return ( ctxt ) ;
}
/************************************************************************
* *
2020-03-08 19:19:42 +03:00
* Handling of node information *
2001-02-23 20:55:21 +03:00
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/**
 * xmlClearParserCtxt:
 * @ctxt:  an XML parser context
 *
 * Clear (release owned resources) and reinitialize a parser context.
 * The node info sequence is cleared first, then the context is reset
 * through xmlCtxtReset(). A NULL @ctxt is ignored.
 */
void
xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
{
    if (ctxt != NULL) {
        xmlClearNodeInfoSeq(&ctxt->node_seq);
        xmlCtxtReset(ctxt);
    }
}
2004-11-05 20:22:25 +03:00
2001-02-23 20:55:21 +03:00
/**
* xmlParserFindNodeInfo :
2002-12-10 18:19:08 +03:00
* @ ctx : an XML parser context
2001-02-23 20:55:21 +03:00
* @ node : an XML node within the tree
*
2022-08-24 16:12:24 +03:00
* DEPRECATED : Don ' t use .
*
2001-02-23 20:55:21 +03:00
* Find the parser node info struct for a given node
2012-09-11 09:26:36 +04:00
*
2001-02-23 20:55:21 +03:00
* Returns an xmlParserNodeInfo block pointer or NULL
*/
2004-11-05 20:22:25 +03:00
const xmlParserNodeInfo *
xmlParserFindNodeInfo ( const xmlParserCtxtPtr ctx , const xmlNodePtr node )
2001-02-23 20:55:21 +03:00
{
2004-11-05 20:22:25 +03:00
unsigned long pos ;
if ( ( ctx = = NULL ) | | ( node = = NULL ) )
return ( NULL ) ;
/* Find position where node should be at */
pos = xmlParserFindNodeInfoIndex ( & ctx - > node_seq , node ) ;
if ( pos < ctx - > node_seq . length
& & ctx - > node_seq . buffer [ pos ] . node = = node )
return & ctx - > node_seq . buffer [ pos ] ;
else
return NULL ;
2001-02-23 20:55:21 +03:00
}
/**
 * xmlInitNodeInfoSeq:
 * @seq:  a node info sequence pointer
 *
 * DEPRECATED: Don't use.
 *
 * Initialize (set to initial state) a node info sequence: no buffer,
 * zero length and zero capacity. Nothing is freed here. A NULL @seq is
 * ignored.
 */
void
xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
{
    if (seq != NULL) {
        seq->buffer = NULL;
        seq->maximum = 0;
        seq->length = 0;
    }
}
/**
 * xmlClearNodeInfoSeq:
 * @seq:  a node info sequence pointer
 *
 * DEPRECATED: Don't use.
 *
 * Clear (release memory and reinitialize) a node info sequence.
 * A NULL @seq is ignored.
 */
void
xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
{
    if (seq != NULL) {
        if (seq->buffer != NULL) {
            xmlFree(seq->buffer);
        }
        /* Put the sequence back into its initial state. */
        xmlInitNodeInfoSeq(seq);
    }
}
/**
* xmlParserFindNodeInfoIndex :
* @ seq : a node info sequence pointer
* @ node : an XML node pointer
*
2022-08-24 16:12:24 +03:00
* DEPRECATED : Don ' t use .
2012-09-11 09:26:36 +04:00
*
2001-02-23 20:55:21 +03:00
* xmlParserFindNodeInfoIndex : Find the index that the info record for
* the given node is or should be at in a sorted sequence
*
* Returns a long indicating the position of the record
*/
2004-11-05 20:22:25 +03:00
unsigned long
xmlParserFindNodeInfoIndex ( const xmlParserNodeInfoSeqPtr seq ,
const xmlNodePtr node )
2001-02-23 20:55:21 +03:00
{
2004-11-05 20:22:25 +03:00
unsigned long upper , lower , middle ;
int found = 0 ;
if ( ( seq = = NULL ) | | ( node = = NULL ) )
2005-12-10 14:11:12 +03:00
return ( ( unsigned long ) - 1 ) ;
2004-11-05 20:22:25 +03:00
/* Do a binary search for the key */
lower = 1 ;
upper = seq - > length ;
middle = 0 ;
while ( lower < = upper & & ! found ) {
middle = lower + ( upper - lower ) / 2 ;
if ( node = = seq - > buffer [ middle - 1 ] . node )
found = 1 ;
else if ( node < seq - > buffer [ middle - 1 ] . node )
upper = middle - 1 ;
else
lower = middle + 1 ;
}
/* Return position */
if ( middle = = 0 | | seq - > buffer [ middle - 1 ] . node < node )
return middle ;
2001-02-23 20:55:21 +03:00
else
2004-11-05 20:22:25 +03:00
return middle - 1 ;
2001-02-23 20:55:21 +03:00
}
/**
* xmlParserAddNodeInfo :
* @ ctxt : an XML parser context
* @ info : a node info sequence pointer
*
2022-08-24 16:12:24 +03:00
* DEPRECATED : Don ' t use .
*
2001-02-23 20:55:21 +03:00
* Insert node info record into the sorted sequence
*/
void
2002-01-23 20:53:44 +03:00
xmlParserAddNodeInfo ( xmlParserCtxtPtr ctxt ,
2002-01-21 01:08:18 +03:00
const xmlParserNodeInfoPtr info )
2001-02-23 20:55:21 +03:00
{
2002-01-23 20:53:44 +03:00
unsigned long pos ;
2004-11-05 20:22:25 +03:00
if ( ( ctxt = = NULL ) | | ( info = = NULL ) ) return ;
2002-01-23 20:53:44 +03:00
/* Find pos and check to see if node is already in the sequence */
2003-07-31 18:47:38 +04:00
pos = xmlParserFindNodeInfoIndex ( & ctxt - > node_seq , ( xmlNodePtr )
2002-01-23 20:53:44 +03:00
info - > node ) ;
2006-03-09 17:13:55 +03:00
2012-09-11 09:26:36 +04:00
if ( ( pos < ctxt - > node_seq . length ) & &
2006-03-09 17:13:55 +03:00
( ctxt - > node_seq . buffer ! = NULL ) & &
( ctxt - > node_seq . buffer [ pos ] . node = = info - > node ) ) {
2002-01-23 20:53:44 +03:00
ctxt - > node_seq . buffer [ pos ] = * info ;
}
2001-02-23 20:55:21 +03:00
2002-01-23 20:53:44 +03:00
/* Otherwise, we need to add new node to buffer */
else {
2013-08-03 18:25:13 +04:00
if ( ( ctxt - > node_seq . length + 1 > ctxt - > node_seq . maximum ) | |
( ctxt - > node_seq . buffer = = NULL ) ) {
2002-01-23 20:53:44 +03:00
xmlParserNodeInfo * tmp_buffer ;
unsigned int byte_size ;
if ( ctxt - > node_seq . maximum = = 0 )
ctxt - > node_seq . maximum = 2 ;
byte_size = ( sizeof ( * ctxt - > node_seq . buffer ) *
( 2 * ctxt - > node_seq . maximum ) ) ;
if ( ctxt - > node_seq . buffer = = NULL )
2003-04-22 03:07:45 +04:00
tmp_buffer = ( xmlParserNodeInfo * ) xmlMalloc ( byte_size ) ;
2002-01-23 20:53:44 +03:00
else
tmp_buffer =
( xmlParserNodeInfo * ) xmlRealloc ( ctxt - > node_seq . buffer ,
byte_size ) ;
if ( tmp_buffer = = NULL ) {
2003-10-06 01:33:18 +04:00
xmlErrMemory ( ctxt , " failed to allocate buffer \n " ) ;
2002-01-23 20:53:44 +03:00
return ;
}
ctxt - > node_seq . buffer = tmp_buffer ;
ctxt - > node_seq . maximum * = 2 ;
}
2001-02-23 20:55:21 +03:00
2002-01-23 20:53:44 +03:00
/* If position is not at end, move elements out of the way */
if ( pos ! = ctxt - > node_seq . length ) {
unsigned long i ;
2001-02-23 20:55:21 +03:00
2002-01-23 20:53:44 +03:00
for ( i = ctxt - > node_seq . length ; i > pos ; i - - )
ctxt - > node_seq . buffer [ i ] = ctxt - > node_seq . buffer [ i - 1 ] ;
}
2001-02-23 20:55:21 +03:00
2002-01-23 20:53:44 +03:00
/* Copy element and increase length */
ctxt - > node_seq . buffer [ pos ] = * info ;
ctxt - > node_seq . length + + ;
2001-02-23 20:55:21 +03:00
}
}
2001-07-25 21:18:57 +04:00
/************************************************************************
* *
* Defaults settings *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/**
 * xmlPedanticParserDefault:
 * @val:  int 0 or 1
 *
 * DEPRECATED: Use the modern options API with XML_PARSE_PEDANTIC.
 *
 * Set the global default for enabling pedantic warnings and return the
 * previous one.
 *
 * Returns the value the default had before this call.
 */
int
xmlPedanticParserDefault(int val) {
    int previous;

    previous = xmlPedanticParserDefaultValue;
    xmlPedanticParserDefaultValue = val;
    return(previous);
}
/**
 * xmlLineNumbersDefault:
 * @val:  int 0 or 1
 *
 * DEPRECATED: The modern options API always enables line numbers.
 *
 * Set the global default for enabling line numbers in elements contents
 * and return the previous one. This may break on old application and is
 * turned off by default.
 *
 * Returns the value the default had before this call.
 */
int
xmlLineNumbersDefault(int val) {
    int previous;

    previous = xmlLineNumbersDefaultValue;
    xmlLineNumbersDefaultValue = val;
    return(previous);
}
/**
 * xmlSubstituteEntitiesDefault:
 * @val:  int 0 or 1
 *
 * DEPRECATED: Use the modern options API with XML_PARSE_NOENT.
 *
 * Set the global default for entity support and return the previous one.
 * Initially the parser always keeps entity references instead of
 * substituting entity values in the output. This function has to be used
 * to change the default parser behavior.
 * SAX::substituteEntities() has to be used for changing that on a file by
 * file basis.
 *
 * Returns the last value for 0 for no substitution, 1 for substitution.
 */
int
xmlSubstituteEntitiesDefault(int val) {
    int previous;

    previous = xmlSubstituteEntitiesDefaultValue;
    xmlSubstituteEntitiesDefaultValue = val;
    return(previous);
}
/**
* xmlKeepBlanksDefault :
2012-09-11 09:26:36 +04:00
* @ val : int 0 or 1
2001-07-25 21:18:57 +04:00
*
2022-08-24 16:55:46 +03:00
* DEPRECATED : Use the modern options API with XML_PARSE_NOBLANKS .
*
2001-07-25 21:18:57 +04:00
* Set and return the previous value for default blanks text nodes support .
* The 1. x version of the parser used an heuristic to try to detect
* ignorable white spaces . As a result the SAX callback was generating
2003-09-26 18:51:39 +04:00
* xmlSAX2IgnorableWhitespace ( ) callbacks instead of characters ( ) one , and when
2001-07-25 21:18:57 +04:00
* using the DOM output text nodes containing those blanks were not generated .
* The 2. x and later version will switch to the XML standard way and
* ignorableWhitespace ( ) are only generated when running the parser in
* validating mode and when the current element doesn ' t allow CDATA or
* mixed content .
2012-09-11 09:26:36 +04:00
* This function is provided as a way to force the standard behavior
2001-07-25 21:18:57 +04:00
* on 1. X libs and to switch back to the old mode for compatibility when
* running 1. X client code on 2. X . Upgrade of 1. X code should be done
* by using xmlIsBlankNode ( ) commodity function to detect the " empty "
* nodes generated .
* This value also affect autogeneration of indentation when saving code
* if blanks sections are kept , indentation is not generated .
*
* Returns the last value for 0 for no substitution , 1 for substitution .
*/
int
xmlKeepBlanksDefault ( int val ) {
int old = xmlKeepBlanksDefaultValue ;
xmlKeepBlanksDefaultValue = val ;
2009-08-20 14:11:17 +04:00
if ( ! val ) xmlIndentTreeOutput = 1 ;
2001-07-25 21:18:57 +04:00
return ( old ) ;
}