1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 """This module contains support for processing XML using a SAX parser.
17
18 In particular, it provides a L{base content handler class<BaseSAXHandler>}
19 that maintains namespace context and element state in a stack; and a L{base
20 element state class <SAXElementState>} which records the location of the
21 element in the stream. These classes are extended for specific parsing needs
22 (e.g., L{pyxb.binding.saxer}).
23 """
24
25 import xml.sax
26 import xml.sax.handler
27 import pyxb.namespace
28 import StringIO
29 import logging
30
31 _log = logging.getLogger(__name__)
32
34 """A SAX handler class which prints each method invocation.
35 """
36
37
38 __trace = False
39
41 print 'setDocumentLocator %s' % (locator,)
42
45
47 print 'startPrefixMapping %s %s' % (prefix, uri)
48
50 print 'endPrefixMapping %s' % (prefix,)
51
53 print 'startElementNS %s %s' % (name, qname)
54
56 print 'endElementNS %s %s' % (name, qname)
57
59 print 'characters %s' % (content,)
60
62 print 'ignorableWhitespace len %d' % (len(whitespace),)
63
65 print 'processingInstruction %s %s' % (target, data)
66
68 """A SAX handler class which doesn't do anything. Used to get baseline
69 performance parsing a particular document.
70 """
71
74
77
80
83
86
89
92
95
98
123
125 """State corresponding to processing a given element with the SAX
126 model."""
127
def contentHandler (self):
    """Return the C{xml.sax.handler.ContentHandler} instance that is
    processing the document.

    @return: Whatever is currently stored as this state's content handler.
    """
    handler = self.__contentHandler
    return handler
131 __contentHandler = None
132
134 """Reference to the SAXElementState of the element enclosing this
135 one."""
136 return self.__parentState
137 __parentState = None
138
def namespaceContext (self):
    """Return the L{pyxb.namespace.resolution.NamespaceContext} used for
    this binding.

    @return: Whatever is currently stored as this state's namespace context.
    """
    ns_ctx = self.__namespaceContext
    return ns_ctx
143 __namespaceContext = None
144
146 """The L{expanded name<pyxb.namespace.ExpandedName>} of the
147 element."""
148 return self.__expandedName
149 __expandedName = None
150
152 """The L{location<pyxb.utils.utility.Location>} corresponding to the
153 element event."""
154 return self.__location
155 __location = None
156
def content (self):
    """Return the content accumulated for this element, which is to be
    supplied to the content model when the element end is reached.

    @return: The list extended by L{addTextContent} and
    L{addElementContent}.  Each member is a C{SAXInformationItem}: text
    items are created with a third argument of C{False}, while element
    items are created with C{True} followed by the element declaration
    (which may be C{None}).
    """
    accumulated = self.__content
    return accumulated
168 __content = None
169
178
def addTextContent (self, location, content):
    """Add the given text as non-element content of the current element.

    @param location: Passed as the first argument of the
    C{SAXInformationItem} recorded for this text.
    @param content: The text to record.
    @type content: C{unicode} or C{str}
    @return: C{self}
    """
    # The third argument is False for text; addElementContent passes True.
    self.__content.append(SAXInformationItem(location, content, False))
    # Honor the documented contract: previously nothing was returned even
    # though the docstring promised the state object itself.
    return self
185
def addElementContent (self, location, element, element_decl=None):
    """Record a binding instance as element content of the current element.

    @param location: Passed as the first argument of the
    C{SAXInformationItem} recorded for this element.

    @param element: Any L{binding instance<pyxb.binding.basis._TypeBinding_mixin>}.

    @param element_decl: The L{element
    use<pyxb.binding.content.ElementDeclaration>} in the containing complex
    type, or C{None} (the default).
    """
    # The third argument is True for element content; text content uses False.
    item = SAXInformationItem(location, element, True, element_decl)
    self.__content.append(item)
196
198 """A SAX handler class that maintains a stack of enclosing elements and
199 manages namespace declarations.
200
201 This is the base for L{pyxb.utils.saxdom._DOMSAXHandler} and
202 L{pyxb.binding.saxer.PyXBSAXHandler}.
203 """
204
205
206
207 __locationTemplate = None
208
212
213
214
215
216 __elementStateConstructor = None
217
218
219
221 """Return the namespace used to resolve unqualified names with no default namespace."""
222 return self.__fallbackNamespace
223 __fallbackNamespace = None
224
225
226
227
228
229
230
231
232
233
234
235 __nextNamespaceContext = None
236
237
def namespaceContext (self):
    """Return the namespace context used to resolve QNames within the
    current element.

    @return: An instance of L{pyxb.namespace.resolution.NamespaceContext}
    """
    current_context = self.__namespaceContext
    return current_context
244 __namespaceContext = None
245
246
247
248
249
250 __includingContext = None
251
252
253
254 __locator = None
255
256
259 __elementState = None
260
261
262 __elementStateStack = []
263
265 """Return the binding object corresponding to the top-most
266 element in the document.
267
268 @return: An instance of L{basis._TypeBinding_mixin} (most usually a
269 L{basis.complexTypeDefinition})."""
270 return self.__rootObject
271 __rootObject = None
272
291
293 """Create a new C{xml.sax.handler.ContentHandler} instance to maintain state relevant to elements.
294
295 @keyword fallback_namespace: Optional namespace to use for unqualified
296 names with no default namespace in scope. Has no effect unless it is
297 an absent namespace.
298
299 @keyword element_state_constructor: Optional callable object that
300 creates instances of L{SAXElementState} that hold element-specific
301 information. Defaults to L{SAXElementState}.
302
303 @keyword target_namespace: Optional namespace to set as the target
304 namespace. If not provided, there is no target namespace (not even an
305 absent one). This is the appropriate situation when processing plain
306 XML documents.
307
308 @keyword location_base: An object to be recorded as the base of all
309 L{pyxb.utils.utility.Location} instances associated with events and
310 objects handled by the parser.
311 """
312 self.__includingContext = kw.pop('including_context', None)
313 self.__fallbackNamespace = kw.pop('fallback_namespace', None)
314 self.__elementStateConstructor = kw.pop('element_state_constructor', SAXElementState)
315 self.__targetNamespace = kw.pop('target_namespace', None)
316 self.__locationTemplate = pyxb.utils.utility.Location(kw.pop('location_base', None))
317
319 """Save the locator object."""
320 self.__locator = locator
321
323 """Process the start of a document.
324
325 This resets this handler for a new document.
326 @note: setDocumentLocator is invoked before startDocument
327 """
328 self.reset()
329
331 ns_ctx = self.__nextNamespaceContext
332 if ns_ctx is None:
333 assert self.__namespaceContext is not None
334 ns_ctx = pyxb.namespace.resolution.NamespaceContext(parent_context=self.__namespaceContext)
335 self.__nextNamespaceContext = ns_ctx
336 return ns_ctx
337
344
345
346
347
348
349
391
405
406
407
408
409
410
411 __pendingText = None
412 __pendingTextLocation = None
421
427
431
434
436 """Dummy used to prevent the SAX parser from crashing when it sees
437 processing instructions that we don't care about."""
439 return StringIO.StringIO('')
440
441 _CreateParserModules = []
443 """Provide list of modules to be used when creating parsers.
444
445 C{xml.sax.make_parser()} takes as a parameter an optional list of modules
446 which allow customization of the parser to be used. Certain parsers have
447 better support for Unicode than others.
448
449 As an example, providing C{["drv_libxml2"]} causes the libxml2 parser to
450 be used.
451
452 The default behavior if this function is not called, or if it is called
453 with an empty list or C{None}, is to provide no specific modules, which
454 will result in the system default parser (probably expat).
455
456 @param create_parser_modules: an iterable list of names of modules that
457 provide a C{create_parser} function. Pass C{None} to reset to the system
458 default. """
459 global _CreateParserModules
460 if create_parser_modules is None:
461 _CreateParserModules = []
462 else:
463 _CreateParserModules = list(create_parser_modules)
464
466 """Extend C{xml.sax.make_parser} to configure the parser the way we
467 need it:
468
469 - C{feature_namespaces} is set to C{True} so we process xmlns
470 directives properly
471 - C{feature_namespace_prefixes} is set to C{False} so we don't get
472 prefixes encoded into our names (probably redundant with the above but
473 still...)
474
475 All keywords not documented here (and C{fallback_namespace}, which is) are
476 passed to the C{content_handler_constructor} if that must be invoked.
477
478 @keyword content_handler: The content handler instance for the
479 parser to use. If not provided, an instance of C{content_handler_constructor}
480 is created and used.
481 @type content_handler: C{xml.sax.handler.ContentHandler}
482
483 @keyword content_handler_constructor: A callable which produces an
484 appropriate instance of (a subclass of) L{BaseSAXHandler}. The default is
485 L{BaseSAXHandler}.
486
487 @keyword fallback_namespace: The namespace to use for lookups of
488 unqualified names in absent namespaces; see
489 L{pyxb.namespace.ExpandedName}. This keyword is not used by this
490 function, but is passed to the C{content_handler_constructor}.
491 @type fallback_namespace: L{pyxb.namespace.Namespace}
492 """
493 content_handler_constructor = kw.pop('content_handler_constructor', BaseSAXHandler)
494 content_handler = kw.pop('content_handler', None)
495 if content_handler is None:
496 content_handler = content_handler_constructor(**kw)
497 parser = xml.sax.make_parser(_CreateParserModules)
498 parser.setFeature(xml.sax.handler.feature_namespaces, True)
499 parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
500 parser.setContentHandler(content_handler)
501
502 try:
503 parser.setEntityResolver(_EntityResolver())
504 except xml.sax.SAXNotSupportedException:
505 pass
506 return parser
507
if '__main__' == __name__:
    # Ad hoc benchmark: parse one XML document with several parser/handler
    # combinations and print the time spent setting up and parsing for each.
    # The document path may be supplied as the first command-line argument.
    import xml.dom.pulldom
    import xml.dom.minidom
    import pyxb.utils.saxdom as saxdom
    import time
    import lxml.sax
    import lxml.etree
    import sys

    Handler = BaseSAXHandler
    xml_file = 'examples/tmsxtvd/tmsdatadirect_sample.xml'
    if 1 < len(sys.argv):
        xml_file = sys.argv[1]

    # Read the document inside the dt1..dt2 window so the "read" figure
    # reported below measures the read (it used to be always ~0), and close
    # the file deterministically instead of leaking the handle.
    dt1 = time.time()
    with open(xml_file) as xml_stream:
        xmls = xml_stream.read()
    dt2 = time.time()
    dom = xml.dom.minidom.parseString(xmls)
    dt3 = time.time()

    # SAX parse with a handler that ignores every event.
    snt1 = time.time()
    saxer = make_parser(content_handler=_NoopSAXHandler())
    snt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    snt3 = time.time()

    # SAX parse with the namespace-tracking base handler.
    sbt1 = time.time()
    saxer = make_parser(content_handler=BaseSAXHandler())
    sbt2 = time.time()
    saxer.parse(StringIO.StringIO(xmls))
    sbt3 = time.time()

    # SAX parse with the saxdom handler supplied as the handler constructor.
    pdt1 = time.time()
    sdomer = make_parser(content_handler_constructor=saxdom._DOMSAXHandler)
    h = sdomer.getContentHandler()
    pdt2 = time.time()
    sdomer.parse(StringIO.StringIO(xmls))
    pdt3 = time.time()

    # lxml tree construction, then replay of the tree as SAX events into
    # the base handler.
    lst1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    lst2 = time.time()
    lsh = Handler()
    lxml.sax.saxify(tree, lsh)
    lst3 = time.time()

    # lxml tree construction, then replay of the tree as SAX events into a
    # pulldom SAX2DOM handler.
    ldt1 = time.time()
    tree = lxml.etree.fromstring(xmls)
    ldt2 = time.time()
    ldh = xml.dom.pulldom.SAX2DOM()
    lxml.sax.saxify(tree, ldh)
    ldt3 = time.time()

    # Each print receives one parenthesized expression, which produces the
    # same output whether print is a statement or a function.
    print('minidom read %f, parse %f, total %f' % (dt2-dt1, dt3-dt2, dt3-dt1))
    print('SAX+noop create %f, parse %f, total %f' % (snt2-snt1, snt3-snt2, snt3-snt1))
    print('SAX+ns create %f, parse %f, total %f' % (sbt2-sbt1, sbt3-sbt2, sbt3-sbt1))
    print('PyXB SAXDOM-based create %f, parse %f, total %f' % (pdt2-pdt1, pdt3-pdt2, pdt3-pdt1))
    print('LXML+SAX tree %f, parse %f, total %f' % (lst2-lst1, lst3-lst2, lst3-lst1))
    print('LXML+pulldom DOM tree %f, parse %f, total %f' % (ldt2-ldt1, ldt3-ldt2, ldt3-ldt1))
567
568
569
570
571