Package pyxb :: Package utils :: Module saxutils
[hide private]
[frames] | [no frames]

Source Code for Module pyxb.utils.saxutils

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2013, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for processing XML using a SAX parser. 
 17   
 18  In particular, it provides a L{base content handler class<BaseSAXHandler>} 
 19  that maintains namespace context and element state in a stack; and a L{base 
 20  element state class <SAXElementState>} which records the location of the 
 21  element in the stream.  These classes are extended for specific parsing needs 
 22  (e.g., L{pyxb.binding.saxer}). 
 23  """ 
 24   
 25  import xml.sax 
 26  import xml.sax.handler 
 27  import pyxb.namespace 
 28  import StringIO 
 29  import logging 
 30   
 31  _log = logging.getLogger(__name__) 
 32   
class TracingSAXHandler (xml.sax.handler.ContentHandler):
    """A SAX handler class which prints each method invocation.

    Each callback writes a single line to standard output consisting of the
    callback name followed by the arguments it received (or, for ignorable
    whitespace, the length of the whitespace).

    The output is produced with the single-argument, parenthesised form of
    C{print}, which emits the same text whether C{print} is a statement
    (Python 2) or a function (Python 3).
    """

    # Whether invocation of handler methods should be traced.  No method of
    # this class consults the flag: every callback prints unconditionally.
    __trace = False

    def setDocumentLocator (self, locator):
        """Report the locator supplied by the parser."""
        print('setDocumentLocator %s' % (locator,))

    def startDocument (self):
        """Report the start of the document."""
        print('startDocument')

    def startPrefixMapping (self, prefix, uri):
        """Report a namespace prefix coming into scope."""
        print('startPrefixMapping %s %s' % (prefix, uri))

    def endPrefixMapping (self, prefix):
        """Report a namespace prefix going out of scope."""
        print('endPrefixMapping %s' % (prefix,))

    def startElementNS (self, name, qname, attrs):
        """Report the start of an element; C{attrs} is not printed."""
        print('startElementNS %s %s' % (name, qname))

    def endElementNS (self, name, qname):
        """Report the end of an element."""
        print('endElementNS %s %s' % (name, qname))

    def characters (self, content):
        """Report character data."""
        print('characters %s' % (content,))

    def ignorableWhitespace (self, whitespace):
        """Report the length (not the content) of ignorable whitespace."""
        print('ignorableWhitespace len %d' % (len(whitespace),))

    def processingInstruction (self, target, data):
        """Report a processing instruction."""
        print('processingInstruction %s %s' % (target, data))
66
class _NoopSAXHandler (xml.sax.handler.ContentHandler):
    """SAX content handler whose callbacks all discard their arguments.

    Parsing a document with this handler installed gives the baseline cost
    of the parser itself, against which handlers that do real work can be
    compared.
    """

    def setDocumentLocator (self, locator):
        """Discard the locator; nothing is recorded."""

    def startDocument (self):
        """Do nothing at the start of the document."""

    def startPrefixMapping (self, prefix, uri):
        """Do nothing when a prefix mapping comes into scope."""

    def endPrefixMapping (self, prefix):
        """Do nothing when a prefix mapping goes out of scope."""

    def startElementNS (self, name, qname, attrs):
        """Do nothing at the start of an element."""

    def endElementNS (self, name, qname):
        """Do nothing at the end of an element."""

    def characters (self, content):
        """Discard character data."""

    def ignorableWhitespace (self, whitespace):
        """Discard ignorable whitespace."""

    def processingInstruction (self, target, data):
        """Discard the processing instruction."""
98
class SAXInformationItem (object):
    """Record of a single item found in the body of an element."""

    location = None
    """Position in the document at which the item started."""

    item = None
    """The item itself: generally character information (as text), a DOM
    Node instance, or a binding instance."""

    maybe_element = None
    """C{False} iff the L{item} is character information rather than
    element content."""

    element_decl = None
    """The L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>}
    used for the L{item}, or C{None} for element content that has no
    enclosing CTD scope."""

    def __init__ (self, location, item, maybe_element, element_decl=None):
        # Store the four fields exactly as supplied; no validation is done.
        (self.location, self.item) = (location, item)
        (self.maybe_element, self.element_decl) = (maybe_element, element_decl)
123
class SAXElementState (object):
    """State corresponding to processing a given element with the SAX
    model.

    An instance records the handler, enclosing state, namespace context,
    expanded name and start location of one element, and accumulates the
    content seen inside that element as a list of L{SAXInformationItem}
    instances.
    """

    def contentHandler (self):
        """Reference to the C{xml.sax.handler.ContentHandler} that is processing the document."""
        return self.__contentHandler
    __contentHandler = None

    def parentState (self):
        """Reference to the SAXElementState of the element enclosing this
        one."""
        return self.__parentState
    __parentState = None

    def namespaceContext (self):
        """The L{pyxb.namespace.resolution.NamespaceContext} used for this
        binding."""
        return self.__namespaceContext
    __namespaceContext = None

    def expandedName (self):
        """The L{expanded name<pyxb.namespace.ExpandedName>} of the
        element."""
        return self.__expandedName
    __expandedName = None

    def location (self):
        """The L{location<pyxb.utils.utility.Location>} corresponding to the
        element event."""
        return self.__location
    __location = None

    def content (self):
        """An accumulation of content to be supplied to the content model when
        the element end is reached.

        This is a list of L{SAXInformationItem} instances in the order the
        content was added.  For each member, C{item} is text or a binding
        instance; C{element_decl} is C{None} or the
        L{ElementDeclaration<pyxb.binding.content.ElementDeclaration>}
        instance supplied when the content was added; and C{maybe_element}
        is C{False} for content added through L{addTextContent} and C{True}
        for content added through L{addElementContent}."""
        return self.__content
    __content = None

    def __init__ (self, **kw):
        """Create the state for one element.

        @keyword expanded_name: Optional expanded name of the element.
        @keyword namespace_context: Required namespace context for the
        element; a C{KeyError} is raised if it is missing.
        @keyword parent_state: Optional state of the enclosing element.
        @keyword content_handler: The handler processing the document.  Its
        C{location()} method is invoked to record where this element began.
        """
        self.__expandedName = kw.get('expanded_name')
        self.__namespaceContext = kw['namespace_context']
        self.__parentState = kw.get('parent_state')
        self.__contentHandler = kw.get('content_handler')
        assert self.__contentHandler is not None
        self.__location = self.__contentHandler.location()
        self.__content = []

    def addTextContent (self, location, content):
        """Add the given text as non-element content of the current element.

        @param location: Where the text began in the document.
        @type content: C{unicode} or C{str}
        @return: C{self}
        """
        self.__content.append(SAXInformationItem(location, content, False))
        # The documented contract is to return self so calls can be chained.
        return self

    def addElementContent (self, location, element, element_decl=None):
        """Add the given binding instance as element content corresponding to
        the given use.

        @param location: Where the element began in the document.

        @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

        @param element_decl: The L{element
        use<pyxb.binding.content.ElementDeclaration>} in the containing complex type.

        @return: C{self}, matching L{addTextContent}
        """
        self.__content.append(SAXInformationItem(location, element, True, element_decl))
        return self
196
class BaseSAXHandler (xml.sax.handler.ContentHandler, object):
    """A SAX handler class that maintains a stack of enclosing elements and
    manages namespace declarations.

    This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
    L{pyxb.binding.saxer.PyXBSAXHandler}.
    """

    # An instance of L{pyxb.utils.utility.Location} that will be used to
    # construct the locations of events as they are received.
    __locationTemplate = None

    def location (self):
        """Return the current location within the SAX-processed document."""
        return self.__locationTemplate.newLocation(self.__locator)

    # The callable that creates an instance of (a subclass of)
    # L{SAXElementState} as required to hold element-specific information as
    # parsing proceeds.
    __elementStateConstructor = None

    # The namespace to use when processing a document with an absent default
    # namespace.
    def fallbackNamespace (self):
        """Return the namespace used to resolve unqualified names with no default namespace."""
        return self.__fallbackNamespace
    __fallbackNamespace = None

    # The target namespace supplied at construction, or None if there is
    # none.  Declared here so it has a class-level default like its peers.
    __targetNamespace = None

    # The namespace context that will be in effect at the start of the next
    # element, or C{None} if no namespace directive notifications have been
    # received since the last element start or end.  Namespace directive
    # notifications are received before the notification of element start in
    # which they apply, and cause a "next namespace context" to be allocated
    # referencing the current namespace.  The directive is applied to the next
    # context.  A non-None next context becomes active on entry to the next
    # element.  The next context is reset to None on entry to and exit from an
    # element so subsequent new directives are applied to a fresh context
    # inherited from the current context.
    __nextNamespaceContext = None

    # The namespace context that is in effect for this element.
    def namespaceContext (self):
        """Return the namespace context used for QName resolution within the
        current element.

        @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}"""
        return self.__namespaceContext
    __namespaceContext = None

    # The namespace context in a schema that is including the schema to be
    # parsed by this handler.  This is necessary to handle section 4.2.1 when
    # a schema with a non-absent target namespace includes a schema with no
    # target namespace.
    __includingContext = None

    # A SAX locator object.  @todo: Figure out how to associate the
    # location information with the binding objects.
    __locator = None

    # The state for the element currently being processed
    def elementState (self):
        """Return the state for the element currently being processed."""
        return self.__elementState
    __elementState = None

    # The states for all enclosing elements.  The list itself is allocated
    # per instance (in __init__ and reset) so that instances can never share
    # and mutate one class-level list.
    __elementStateStack = None

    def rootObject (self):
        """Return the binding object corresponding to the top-most
        element in the document

        @return: An instance of L{basis._TypeBinding_mixin} (most usually a
        L{basis.complexTypeDefinition})."""
        return self.__rootObject
    __rootObject = None

    def reset (self):
        """Reset the state of the handler in preparation for processing a new
        document.

        A fresh namespace context and top-level element state are created,
        the element stack is emptied, and any text still buffered from an
        earlier (possibly aborted) parse is discarded.

        @return: C{self}
        """
        self.__namespaceContext = pyxb.namespace.resolution.NamespaceContext(default_namespace=self.__fallbackNamespace,
                                                                             target_namespace=self.__targetNamespace,
                                                                             including_context=self.__includingContext,
                                                                             finalize_target_namespace=False)
        self.__nextNamespaceContext = None
        self.__elementState = self.__elementStateConstructor(content_handler=self,
                                                             namespace_context=self.__namespaceContext)
        self.__elementStateStack = []
        self.__rootObject = None
        # Text buffered while processing a previous document must not leak
        # into the first element of the new one.
        self.__pendingText = []
        self.__pendingTextLocation = None
        # Note: setDocumentLocator is invoked before startDocument (which
        # calls this), so this method should not reset it.
        return self

    def __init__ (self, **kw):
        """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.

        @keyword fallback_namespace: Optional namespace to use for unqualified
        names with no default namespace in scope.  Has no effect unless it is
        an absent namespace.

        @keyword element_state_constructor: Optional callable object that
        creates instances of L{SAXElementState} that hold element-specific
        information.  Defaults to L{SAXElementState}.

        @keyword target_namespace: Optional namespace to set as the target
        namespace.  If not provided, there is no target namespace (not even an
        absent one).  This is the appropriate situation when processing plain
        XML documents.

        @keyword location_base: An object to be recorded as the base of all
        L{pyxb.utils.utility.Location} instances associated with events and
        objects handled by the parser.

        @keyword including_context: Optional namespace context of a schema
        that is including the document being parsed.
        """
        self.__includingContext = kw.pop('including_context', None)
        self.__fallbackNamespace = kw.pop('fallback_namespace', None)
        self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
        self.__targetNamespace = kw.pop('target_namespace', None)
        self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
        # Per-instance mutable state; see the class-level comments.
        self.__elementStateStack = []
        self.__pendingText = []
        self.__pendingTextLocation = None

    def setDocumentLocator (self, locator):
        """Save the locator object."""
        self.__locator = locator

    def startDocument (self):
        """Process the start of a document.

        This resets this handler for a new document.
        @note: setDocumentLocator is invoked before startDocument
        """
        self.reset()

    def __getOrCreateNextNamespaceContext (self):
        """Return the namespace context that will apply to the next element.

        If no context is pending, one is created as a child of the active
        namespace context and remembered as the pending context.
        """
        ns_ctx = self.__nextNamespaceContext
        if ns_ctx is None:
            assert self.__namespaceContext is not None
            ns_ctx = pyxb.namespace.resolution.NamespaceContext(parent_context=self.__namespaceContext)
            self.__nextNamespaceContext = ns_ctx
        return ns_ctx

    def startPrefixMapping (self, prefix, uri):
        """Implement base class method.

        @note: For this to be invoked, the C{feature_namespaces} feature must
        be enabled in the SAX parser."""
        self.__getOrCreateNextNamespaceContext().processXMLNS(prefix, uri)

    # The NamespaceContext management does not require any action upon
    # leaving the scope of a namespace directive.
    #def endPrefixMapping (self, prefix):
    #    pass

    def startElementNS (self, name, qname, attrs):
        """Process the start of an element.

        Pending text is flushed to the enclosing element, the namespace
        context for the new element is selected, and a new element state is
        pushed.

        @return: The tuple C{(this_state, parent_state, ns_ctx,
        expanded_name)} describing the element that was just entered.
        """
        self.__flushPendingText()

        # Get the element name, which is already a tuple with the namespace assigned.
        expanded_name = pyxb.namespace.ExpandedName(name, fallback_namespace=self.__fallbackNamespace)

        # See if this element supports a targetNamespace attribute.  xs:schema
        # and wsdl:definitions both do.
        tns_attr = pyxb.namespace.resolution.NamespaceContext._TargetNamespaceAttribute(expanded_name)

        # If we need to assign a target namespace, we need a new context.
        # Otherwise we use the context created from pending namespace
        # directives, or we re-use the current context.
        if tns_attr is not None:
            ns_ctx = self.__getOrCreateNextNamespaceContext()
        else:
            ns_ctx = self.__nextNamespaceContext
        if ns_ctx is None:
            # Re-use the active context
            ns_ctx = self.__namespaceContext
        else:
            # Update the active context
            self.__namespaceContext = ns_ctx
        self.__nextNamespaceContext = None

        if tns_attr is not None:
            # Not true for wsdl
            #assert ns_ctx.targetNamespace() is None
            ns_ctx.finalizeTargetNamespace(attrs.get(tns_attr.uriTuple()), including_context=self.__includingContext)
            assert ns_ctx.targetNamespace() is not None

        # Save the state of the enclosing element, and create a new
        # state for this element.
        parent_state = self.__elementState
        self.__elementStateStack.append(self.__elementState)
        self.__elementState = this_state = self.__elementStateConstructor(content_handler=self,
                                                                          expanded_name=expanded_name,
                                                                          namespace_context=ns_ctx,
                                                                          parent_state=parent_state)
        return (this_state, parent_state, ns_ctx, expanded_name)

    def endElementNS (self, name, qname):
        """Process the completion of an element.

        @return: The state of the element that was just completed.
        """
        self.__flushPendingText()

        # Save the state of this element, and restore the state for
        # the parent to which we are returning.
        this_state = self.__elementState
        parent_state = self.__elementState = self.__elementStateStack.pop()
        # Restore namespace context and prepare for new namespace directives
        self.__namespaceContext = parent_state.namespaceContext()
        self.__nextNamespaceContext = None

        return this_state

    # We accumulate consecutive text events into a single event, primarily to
    # avoid the confusion that results when the value of a simple type is
    # represented by multiple events, as with "B &amp; W".  Also, it's faster
    # to join them all at once, and to process one content value rather than a
    # sequence of them.
    __pendingText = None
    __pendingTextLocation = None

    def __flushPendingText (self):
        """Hand any buffered text to the current element state as a single
        value, then empty the buffer."""
        if self.__pendingText:
            location = self.__pendingTextLocation
            if location is None:
                location = self.location()
            self.__elementState.addTextContent(location, ''.join(self.__pendingText))
        self.__pendingTextLocation = None
        self.__pendingText = []

    def __bufferText (self, text):
        """Append C{text} to the pending-text buffer, creating the buffer if
        it has not been allocated yet."""
        if self.__pendingText is None:
            self.__pendingText = []
        self.__pendingText.append(text)

    def characters (self, content):
        """Save the text as content"""
        if self.__pendingTextLocation is None:
            self.__pendingTextLocation = self.location()
        self.__bufferText(content)

    def ignorableWhitespace (self, whitespace):
        """Save whitespace as content too."""
        # NOTE(review): unlike characters(), this does not record a start
        # location; if only whitespace is pending, the location is taken at
        # flush time.  Confirm whether that is intended before changing it.
        self.__bufferText(whitespace)

    def processingInstruction (self, target, data):
        """Flush pending text; the instruction itself is otherwise ignored."""
        self.__flushPendingText()
434
435 -class _EntityResolver (object):
436 """Dummy used to prevent the SAX parser from crashing when it sees 437 processing instructions that we don't care about."""
438 - def resolveEntity (self, public_id, system_id):
439 return StringIO.StringIO('')
# Names of the modules handed to xml.sax.make_parser(); an empty list selects
# the system default parser.
_CreateParserModules = []

def SetCreateParserModules (create_parser_modules):
    """Record the modules to be used when creating parsers.

    C{xml.sax.make_parser()} accepts an optional list of module names that
    customise which parser is used.  Certain parsers have better support for
    Unicode than others.

    For example, providing C{["drv_libxml2"]} causes the libxml2 parser to
    be used.

    If this function is never called, or is called with an empty list or
    C{None}, no specific modules are provided and the system default parser
    (probably expat) results.

    @param create_parser_modules: an iterable of names of modules that
    provide a C{create_parser} function.  Pass C{None} to reset to the system
    default.  The names are copied into a new list.
    """
    global _CreateParserModules
    _CreateParserModules = [] if create_parser_modules is None else list(create_parser_modules)
464
def make_parser (**kw):
    """Wrap C{xml.sax.make_parser}, configuring the parser the way we need
    it:

      - C{feature_namespaces} is set to C{True} so we process xmlns
        directives properly
      - C{feature_namespace_prefixes} is set to C{False} so we don't get
        prefixes encoded into our names (probably redundant with the above but
        still...)

    All keywords not documented here (and C{fallback_namespace}, which is) are
    passed to the C{content_handler_constructor} if that must be invoked.

    @keyword content_handler: The content handler instance for the
    parser to use.  If not provided, an instance of C{content_handler_constructor}
    is created and used.
    @type content_handler: C{xml.sax.handler.ContentHandler}

    @keyword content_handler_constructor: A callable which produces an
    appropriate instance of (a subclass of) L{BaseSAXHandler}.  The default is
    L{BaseSAXHandler}.

    @keyword fallback_namespace: The namespace to use for lookups of
    unqualified names in absent namespaces; see
    L{pyxb.namespace.ExpandedName}.  This keyword is not used by this
    function, but is passed to the C{content_handler_constructor}.
    @type fallback_namespace: L{pyxb.namespace.Namespace}

    @return: The configured parser, with the content handler installed.
    """
    handler_factory = kw.pop('content_handler_constructor', BaseSAXHandler)
    handler = kw.pop('content_handler', None)
    if handler is None:
        # Whatever keywords remain belong to the handler constructor.
        handler = handler_factory(**kw)
    sax_parser = xml.sax.make_parser(_CreateParserModules)
    feature_settings = (
        (xml.sax.handler.feature_namespaces, True),
        (xml.sax.handler.feature_namespace_prefixes, False),
    )
    for (feature, enabled) in feature_settings:
        sax_parser.setFeature(feature, enabled)
    sax_parser.setContentHandler(handler)
    # libxml2 doesn't support this feature
    try:
        sax_parser.setEntityResolver(_EntityResolver())
    except xml.sax.SAXNotSupportedException:
        pass
    return sax_parser
# Benchmark driver: parse one document with several parser/handler
# combinations and report how long each one took to set up and to parse.
if '__main__' == __name__:
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Read the document inside the dt1..dt2 window so that the "read" figure
    # reported below measures the read, and close the file promptly.
    dt1 = time.time()
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX with a handler that does nothing: baseline parser cost.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX driving the PyXB SAX-based DOM builder.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then replay as SAX events to the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then replay as SAX events to a pulldom builder.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))

## Local Variables:
## fill-column:78
## End: