2004-07-11 18:41:20 +04:00
# -*- coding: iso-8859-1 -*-
2003-01-02 16:00:02 +03:00
""" A SAX2 driver for libxml2, on top of it ' s XmlReader API
USAGE
# put this file (drv_libxml2.py) in PYTHONPATH
import xml . sax
reader = xml . sax . make_parser ( [ " drv_libxml2 " ] )
# ...and the rest is standard python sax.
CAVEATS
- Lexical handlers are supported , except for start / endEntity
( waiting for XmlReader . ResolveEntity ) and start / endDTD
2003-01-21 00:26:34 +03:00
- Error callbacks are not exactly synchronous , they tend
to be invoked before the corresponding content callback ,
because the underlying reader interface parses
data by chunks of 512 bytes
2003-01-02 16:00:02 +03:00
TODO
- search for TODO
- some ErrorHandler events ( warning )
- some ContentHandler events ( setDocumentLocator , skippedEntity )
- EntityResolver ( using libxml2 . ? )
- DTDHandler ( if / when libxml2 exposes such node types )
- DeclHandler ( if / when libxml2 exposes such node types )
- property_xml_string ?
- feature_string_interning ?
- Incremental parser
- additional performance tuning :
- one might cache callbacks to avoid some name lookups
- one might implement a smarter way to pass attributes to startElement
( some kind of lazy evaluation ? )
- there might be room for improvement in start / endPrefixMapping
- other ?
"""
2013-07-11 07:00:54 +04:00
__author__ = " St<EFBFBD> phane Bidoul <sbi@skynet.be> "
2003-01-21 00:26:34 +03:00
__version__ = " 0.3 "
2003-01-02 16:00:02 +03:00
2013-07-11 07:00:54 +04:00
import sys
2003-01-02 16:00:02 +03:00
import codecs
2013-07-11 07:00:54 +04:00
2013-07-12 07:18:11 +04:00
if sys . version_info [ 0 ] < 3 :
2013-07-11 07:00:54 +04:00
__author__ = codecs . unicode_escape_decode ( __author__ ) [ 0 ]
2013-07-12 07:18:11 +04:00
StringTypes = ( str , unicode )
2016-01-18 14:46:41 +03:00
# libxml2 returns strings as UTF8
_decoder = codecs . lookup ( " utf8 " ) [ 1 ]
def _d ( s ) :
if s is None :
return s
else :
return _decoder ( s ) [ 0 ]
2013-07-11 07:00:54 +04:00
else :
2013-07-12 07:18:11 +04:00
StringTypes = str
2016-01-18 14:46:41 +03:00
# s is Unicode `str` already
def _d ( s ) :
return s
2003-01-02 16:00:02 +03:00
from xml . sax . _exceptions import *
from xml . sax import xmlreader , saxutils
from xml . sax . handler import \
feature_namespaces , \
feature_namespace_prefixes , \
feature_string_interning , \
feature_validation , \
feature_external_ges , \
feature_external_pes , \
property_lexical_handler , \
property_declaration_handler , \
property_dom_node , \
property_xml_string
try :
import libxml2
2013-07-12 07:18:11 +04:00
except ImportError :
2003-01-10 00:36:42 +03:00
raise SAXReaderNotAvailable ( " libxml2 not available: " \
2013-07-12 07:18:11 +04:00
" import error was: %s " % sys . exc_info ( ) [ 1 ] )
2003-01-02 16:00:02 +03:00
2003-01-21 00:26:34 +03:00
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # The wrapped libxml2 locator; it is only queried when a getter runs.
        self.__locator = locator

    def getSystemId(self):
        "Return the system identifier (the wrapped locator's base URI)."
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        "Return the public identifier; always None for this adapter."
        return None

    def getLineNumber(self):
        "Return the line number reported by the wrapped locator."
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        "Return the column number; always -1 for this adapter."
        return -1
2003-01-02 16:00:02 +03:00
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with reader.Read() and translates each node
    type into the matching ContentHandler / LexicalHandler callback.
    Errors raised by libxml2 are accumulated by _errorHandler() and forwarded
    to the SAX ErrorHandler after each Read().
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader; queues one error.

        The error is stored as a (severity, SAXParseException) pair and is
        dispatched later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward every queued error to the SAX ErrorHandler.

        fatal -- true when the parse is about to stop; the last queued
                 error is then reported through fatalError().
        """
        # Detach the queue first: an error handler may raise (the default
        # one does), and the queue must not survive into the next parse.
        errors, self.__errors = self.__errors, None
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    @staticmethod
    def _splitNsDecl(qname):
        """Classify an attribute qualified name as a namespace declaration.

        Returns (isDeclaration, prefix). The prefix is None for the default
        namespace declaration ("xmlns") and for ordinary attributes. Only
        "xmlns" itself and names starting with "xmlns:" are declarations;
        an attribute such as "xmlnsfoo" is a regular attribute.
        """
        if qname == "xmlns":
            return True, None
        if qname.startswith("xmlns:"):
            return True, qname[6:]
        return False, None

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        source -- either a string, which is handed to
                  libxml2.newTextReaderFilename(), or any object accepted by
                  saxutils.prepare_input_source().

        endDocument() is only emitted when the reader reaches the end of the
        input (Read() returned 0). The reader is closed and the parsing flag
        cleared even when a handler raises.
        """
        self.__parsing = 1
        # drop anything left behind by a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping), one list per open element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException(" Read failed (no details available) "))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            isDecl, newPrefix = self._splitNsDecl(qname)
                            if isDecl:
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        # go back to the element so IsEmptyElement() below
                        # is asked about the element, not the last attribute
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close it now
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the reader
                    # itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says this driver was
                    # still waiting for XmlReader.ResolveEntity -- confirm the
                    # libxml2 binding in use provides it.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    # TODO: self.__lex_handler.startDTD()
                    # (how to detect endDTD? on first non-dtd event?)
                    pass
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Not handled on purpose, these end up in the else below:
                #   2  Attribute (never in this loop)
                #   9  Document (not exposed)
                #   11 DocumentFragment (never returned by XmlReader)
                #   0  None
                else:
                    raise SAXException(" Unexpected node type %d " % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            try:
                # close the reader even when a handler raised mid-parse
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException(" DTDHandler not supported ")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException(" EntityResolver not supported ")

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for any feature other than
        namespaces, namespace prefixes, validation and external general /
        parameter entities.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(" Feature ' %s ' not recognized " %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running, or when
        asked to switch external general entities off; raises
        SAXNotRecognizedException for unknown features.
        """
        if self.__parsing:
            raise SAXNotSupportedException(" Cannot set feature %s "
                                           " while parsing " % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            # only turning the feature off is rejected; any other state is
            # accepted without being stored
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(" Feature ' %s ' not supported " %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(" Feature ' %s ' not recognized " %
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler selected by *name*.

        Raises SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(" Property ' %s ' not recognized " %
                                            name)

    def setProperty(self, name, value):
        """Install *value* as the lexical handler.

        The declaration handler property is recognized but cannot be set
        (SAXNotSupportedException); any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store the handler
            # in self.__decl_handler instead of raising
            raise SAXNotSupportedException(" Property ' %s ' not supported " %
                                           name)
        else:
            raise SAXNotRecognizedException(" Property ' %s ' not recognized " %
                                            name)
def create_parser():
    """Return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser