1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41 """
42 Provides general XML-related functionality.
43
44 What I'm trying to do here is abstract much of the functionality that directly
45 accesses the DOM tree. This is not so much to "protect" the other code from
46 the DOM, but to standardize the way it's used. It will also help extension
47 authors write code that easily looks more like the rest of Cedar Backup.
48
49 @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren,
50 readFirstChild, readStringList, readString, readInteger, readBoolean,
51 addContainerNode, addStringNode, addIntegerNode, addBooleanNode,
52 TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES
53
54 @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}.
55 @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}.
56 @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML.
57
58 @author: Kenneth J. Pronovici <pronovic@ieee.org>
59 """
60
61
62
63
64
65
66
67 import sys
68 import re
69 import logging
70 import codecs
71 from types import UnicodeType
72 from StringIO import StringIO
73
74
75 from xml.parsers.expat import ExpatError
76 from xml.dom.minidom import Node
77 from xml.dom.minidom import getDOMImplementation
78 from xml.dom.minidom import parseString
79
80
81
82
83
84
85 logger = logging.getLogger("CedarBackup2.log.xml")
86
87 TRUE_BOOLEAN_VALUES = [ "Y", "y", ]
88 FALSE_BOOLEAN_VALUES = [ "N", "n", ]
89 VALID_BOOLEAN_VALUES = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES
90
91
92
93
94
95
109
111 """
112 Creates a DOM tree used for writing an XML document.
113 @param name: Base name of the document (root node name).
114 @return: Tuple (xmlDom, parentNode) for the new document
115 """
116 impl = getDOMImplementation()
117 xmlDom = impl.createDocument(None, name, None)
118 return (xmlDom, xmlDom.documentElement)
119
120
121
122
123
124
126 """
127 Returns True or False depending on whether the XML node is an element node.
128 """
129 return node.nodeType == Node.ELEMENT_NODE
130
132 """
133 Returns a list of nodes with a given name immediately beneath the
134 parent.
135
136 By "immediately beneath" the parent, we mean from among nodes that are
137 direct children of the passed-in parent node.
138
139 Underneath, we use the Python C{getElementsByTagName} method, which is
140 pretty cool, but which (surprisingly?) returns a list of all children
141 with a given name below the parent, at any level. We just prune that
142 list to include only children whose C{parentNode} matches the passed-in
143 parent.
144
145 @param parent: Parent node to search beneath.
146 @param name: Name of nodes to search for.
147
148 @return: List of child nodes with correct parent, or an empty list if
149 no matching nodes are found.
150 """
151 lst = []
152 if parent is not None:
153 result = parent.getElementsByTagName(name)
154 for entry in result:
155 if entry.parentNode is parent:
156 lst.append(entry)
157 return lst
158
160 """
161 Returns the first child with a given name immediately beneath the parent.
162
163 By "immediately beneath" the parent, we mean from among nodes that are
164 direct children of the passed-in parent node.
165
166 @param parent: Parent node to search beneath.
167 @param name: Name of node to search for.
168
169 @return: First properly-named child of parent, or C{None} if no matching nodes are found.
170 """
171 result = readChildren(parent, name)
172 if result is None or result == []:
173 return None
174 return result[0]
175
177 """
178 Returns a list of the string contents associated with nodes with a given
179 name immediately beneath the parent.
180
181 By "immediately beneath" the parent, we mean from among nodes that are
182 direct children of the passed-in parent node.
183
184 First, we find all of the nodes using L{readChildren}, and then we
185 retrieve the "string contents" of each of those nodes. The returned list
186 has one entry per matching node. We assume that string contents of a
187 given node belong to the first C{TEXT_NODE} child of that node. Nodes
188 which have no C{TEXT_NODE} children are not represented in the returned
189 list.
190
191 @param parent: Parent node to search beneath.
192 @param name: Name of node to search for.
193
194 @return: List of strings as described above, or C{None} if no matching nodes are found.
195 """
196 lst = []
197 result = readChildren(parent, name)
198 for entry in result:
199 if entry.hasChildNodes():
200 for child in entry.childNodes:
201 if child.nodeType == Node.TEXT_NODE:
202 lst.append(child.nodeValue)
203 break
204 if lst == []:
205 lst = None
206 return lst
207
209 """
210 Returns string contents of the first child with a given name immediately
211 beneath the parent.
212
213 By "immediately beneath" the parent, we mean from among nodes that are
214 direct children of the passed-in parent node. We assume that string
215 contents of a given node belong to the first C{TEXT_NODE} child of that
216 node.
217
218 @param parent: Parent node to search beneath.
219 @param name: Name of node to search for.
220
221 @return: String contents of node or C{None} if no matching nodes are found.
222 """
223 result = readStringList(parent, name)
224 if result is None:
225 return None
226 return result[0]
227
229 """
230 Returns integer contents of the first child with a given name immediately
231 beneath the parent.
232
233 By "immediately beneath" the parent, we mean from among nodes that are
234 direct children of the passed-in parent node.
235
236 @param parent: Parent node to search beneath.
237 @param name: Name of node to search for.
238
239 @return: Integer contents of node or C{None} if no matching nodes are found.
240 @raise ValueError: If the string at the location can't be converted to an integer.
241 """
242 result = readString(parent, name)
243 if result is None:
244 return None
245 else:
246 return int(result)
247
249 """
250 Returns long integer contents of the first child with a given name immediately
251 beneath the parent.
252
253 By "immediately beneath" the parent, we mean from among nodes that are
254 direct children of the passed-in parent node.
255
256 @param parent: Parent node to search beneath.
257 @param name: Name of node to search for.
258
259 @return: Long integer contents of node or C{None} if no matching nodes are found.
260 @raise ValueError: If the string at the location can't be converted to an integer.
261 """
262 result = readString(parent, name)
263 if result is None:
264 return None
265 else:
266 return long(result)
267
269 """
270 Returns float contents of the first child with a given name immediately
271 beneath the parent.
272
273 By "immediately beneath" the parent, we mean from among nodes that are
274 direct children of the passed-in parent node.
275
276 @param parent: Parent node to search beneath.
277 @param name: Name of node to search for.
278
279 @return: Float contents of node or C{None} if no matching nodes are found.
280 @raise ValueError: If the string at the location can't be converted to a
281 float value.
282 """
283 result = readString(parent, name)
284 if result is None:
285 return None
286 else:
287 return float(result)
288
290 """
291 Returns boolean contents of the first child with a given name immediately
292 beneath the parent.
293
294 By "immediately beneath" the parent, we mean from among nodes that are
295 direct children of the passed-in parent node.
296
297 The string value of the node must be one of the values in L{VALID_BOOLEAN_VALUES}.
298
299 @param parent: Parent node to search beneath.
300 @param name: Name of node to search for.
301
302 @return: Boolean contents of node or C{None} if no matching nodes are found.
303 @raise ValueError: If the string at the location can't be converted to a boolean.
304 """
305 result = readString(parent, name)
306 if result is None:
307 return None
308 else:
309 if result in TRUE_BOOLEAN_VALUES:
310 return True
311 elif result in FALSE_BOOLEAN_VALUES:
312 return False
313 else:
314 raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
315
316
317
318
319
320
322 """
323 Adds a container node as the next child of a parent node.
324
325 @param xmlDom: DOM tree as from C{impl.createDocument()}.
326 @param parentNode: Parent node to create child for.
327 @param nodeName: Name of the new container node.
328
329 @return: Reference to the newly-created node.
330 """
331 containerNode = xmlDom.createElement(nodeName)
332 parentNode.appendChild(containerNode)
333 return containerNode
334
336 """
337 Adds a text node as the next child of a parent, to contain a string.
338
339 If the C{nodeValue} is None, then the node will be created, but will be
340 empty (i.e. will contain no text node child).
341
342 @param xmlDom: DOM tree as from C{impl.createDocument()}.
343 @param parentNode: Parent node to create child for.
344 @param nodeName: Name of the new container node.
345 @param nodeValue: The value to put into the node.
346
347 @return: Reference to the newly-created node.
348 """
349 containerNode = addContainerNode(xmlDom, parentNode, nodeName)
350 if nodeValue is not None:
351 textNode = xmlDom.createTextNode(nodeValue)
352 containerNode.appendChild(textNode)
353 return containerNode
354
356 """
357 Adds a text node as the next child of a parent, to contain an integer.
358
359 If the C{nodeValue} is None, then the node will be created, but will be
360 empty (i.e. will contain no text node child).
361
362 The integer will be converted to a string using "%d". The result will be
363 added to the document via L{addStringNode}.
364
365 @param xmlDom: DOM tree as from C{impl.createDocument()}.
366 @param parentNode: Parent node to create child for.
367 @param nodeName: Name of the new container node.
368 @param nodeValue: The value to put into the node.
369
370 @return: Reference to the newly-created node.
371 """
372 if nodeValue is None:
373 return addStringNode(xmlDom, parentNode, nodeName, None)
374 else:
375 return addStringNode(xmlDom, parentNode, nodeName, "%d" % nodeValue)
376
def addLongNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Appends a new child element holding a long integer rendered as text.

   A C{nodeValue} of C{None} still produces the element, but leaves it empty
   (no text node child is attached).  Any other value is formatted with "%d"
   and handed to L{addStringNode}, which does the actual DOM work.

   @param xmlDom: DOM tree as from C{impl.createDocument()}.
   @param parentNode: Parent node to create child for.
   @param nodeName: Name of the new container node.
   @param nodeValue: The value to put into the node.

   @return: Reference to the newly-created node.
   """
   # Work out the text first so there is a single call into addStringNode
   text = None
   if nodeValue is not None:
      text = "%d" % nodeValue
   return addStringNode(xmlDom, parentNode, nodeName, text)
398
400 """
401 Adds a text node as the next child of a parent, to contain a boolean.
402
403 If the C{nodeValue} is None, then the node will be created, but will be
404 empty (i.e. will contain no text node child).
405
406 Boolean C{True}, or anything else interpreted as C{True} by Python, will
407 be converted to a string "Y". Anything else will be converted to a
408 string "N". The result is added to the document via L{addStringNode}.
409
410 @param xmlDom: DOM tree as from C{impl.createDocument()}.
411 @param parentNode: Parent node to create child for.
412 @param nodeName: Name of the new container node.
413 @param nodeValue: The value to put into the node.
414
415 @return: Reference to the newly-created node.
416 """
417 if nodeValue is None:
418 return addStringNode(xmlDom, parentNode, nodeName, None)
419 else:
420 if nodeValue:
421 return addStringNode(xmlDom, parentNode, nodeName, "Y")
422 else:
423 return addStringNode(xmlDom, parentNode, nodeName, "N")
424
425
426
427
428
429
431 """
432 Serializes a DOM tree and returns the result in a string.
433 @param xmlDom: XML DOM tree to serialize
434 @param indent: Number of spaces to indent, as an integer
435 @return: String form of DOM tree, pretty-printed.
436 """
437 xmlBuffer = StringIO()
438 serializer = Serializer(xmlBuffer, "UTF-8", indent=indent)
439 serializer.serialize(xmlDom)
440 xmlData = xmlBuffer.getvalue()
441 xmlBuffer.close()
442 return xmlData
443
445
446 """
447 XML serializer class.
448
449 This is a customized serializer that I hacked together based on what I found
450 in the PyXML distribution. Basically, around release 2.7.0, the only reason
451 I still had around a dependency on PyXML was for the PrettyPrint
452 functionality, and that seemed pointless. So, I stripped the PrettyPrint
453 code out of PyXML and hacked bits of it off until it did just what I needed
454 and no more.
455
456 This code started out being called PrintVisitor, but I decided it makes more
457 sense just calling it a serializer. I've made nearly all of the methods
458 private, and I've added a new high-level serialize() method rather than
459 having clients call C{visit()}.
460
461 Anyway, as a consequence of my hacking with it, this can't quite be called a
462 complete XML serializer any more. I ripped out support for HTML and XHTML,
463 and there is also no longer any support for namespaces (which I took out
464 because this dragged along a lot of extra code, and Cedar Backup doesn't use
465 namespaces). However, everything else should pretty much work as expected.
466
467 @copyright: This code, prior to customization, was part of the PyXML
468 codebase, and before that was part of the 4DOM suite developed by
469 Fourthought, Inc. In its original form, it was Copyright (c) 2000
470 Fourthought Inc, USA; All Rights Reserved.
471 """
472
473 - def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3):
474 """
475 Initialize a serializer.
476 @param stream: Stream to write output to.
477 @param encoding: Output encoding.
478 @param indent: Number of spaces to indent, as an integer
479 """
480 self.stream = stream
481 self.encoding = encoding
482 self._indent = indent * " "
483 self._depth = 0
484 self._inText = 0
485
487 """
488 Serialize the passed-in XML document.
489 @param xmlDom: XML DOM tree to serialize
490 @raise ValueError: If there's an unknown node type in the document.
491 """
492 self._visit(xmlDom)
493 self.stream.write("\n")
494
499
501 if not self._inText and self._indent:
502 self._write('\n' + self._indent*self._depth)
503 return
504
506 """
507 @raise ValueError: If there's an unknown node type in the document.
508 """
509 if node.nodeType == Node.ELEMENT_NODE:
510 return self._visitElement(node)
511
512 elif node.nodeType == Node.ATTRIBUTE_NODE:
513 return self._visitAttr(node)
514
515 elif node.nodeType == Node.TEXT_NODE:
516 return self._visitText(node)
517
518 elif node.nodeType == Node.CDATA_SECTION_NODE:
519 return self._visitCDATASection(node)
520
521 elif node.nodeType == Node.ENTITY_REFERENCE_NODE:
522 return self._visitEntityReference(node)
523
524 elif node.nodeType == Node.ENTITY_NODE:
525 return self._visitEntity(node)
526
527 elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
528 return self._visitProcessingInstruction(node)
529
530 elif node.nodeType == Node.COMMENT_NODE:
531 return self._visitComment(node)
532
533 elif node.nodeType == Node.DOCUMENT_NODE:
534 return self._visitDocument(node)
535
536 elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
537 return self._visitDocumentType(node)
538
539 elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
540 return self._visitDocumentFragment(node)
541
542 elif node.nodeType == Node.NOTATION_NODE:
543 return self._visitNotation(node)
544
545
546 raise ValueError("Unknown node type: %s" % repr(node))
547
549 for curr in node:
550 curr is not exclude and self._visit(curr)
551 return
552
554 for item in node.values():
555 self._visit(item)
556 return
557
565
567 self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
568 self._inText = 0
569 return
570
576
580
582 self._tryIndent()
583 self._write('<%s' % node.tagName)
584 for attr in node.attributes.values():
585 self._visitAttr(attr)
586 if len(node.childNodes):
587 self._write('>')
588 self._depth = self._depth + 1
589 self._visitNodeList(node.childNodes)
590 self._depth = self._depth - 1
591 not (self._inText) and self._tryIndent()
592 self._write('</%s>' % node.tagName)
593 else:
594 self._write('/>')
595 self._inText = 0
596 return
597
598 - def _visitText(self, node):
599 text = node.data
600 if self._indent:
601 text.strip()
602 if text:
603 text = _translateCDATA(text, self.encoding)
604 self.stream.write(text)
605 self._inText = 1
606 return
607
609 if not doctype.systemId and not doctype.publicId: return
610 self._tryIndent()
611 self._write('<!DOCTYPE %s' % doctype.name)
612 if doctype.systemId and '"' in doctype.systemId:
613 system = "'%s'" % doctype.systemId
614 else:
615 system = '"%s"' % doctype.systemId
616 if doctype.publicId and '"' in doctype.publicId:
617
618
619
620 public = "'%s'" % doctype.publicId
621 else:
622 public = '"%s"' % doctype.publicId
623 if doctype.publicId and doctype.systemId:
624 self._write(' PUBLIC %s %s' % (public, system))
625 elif doctype.systemId:
626 self._write(' SYSTEM %s' % system)
627 if doctype.entities or doctype.notations:
628 self._write(' [')
629 self._depth = self._depth + 1
630 self._visitNamedNodeMap(doctype.entities)
631 self._visitNamedNodeMap(doctype.notations)
632 self._depth = self._depth - 1
633 self._tryIndent()
634 self._write(']>')
635 else:
636 self._write('>')
637 self._inText = 0
638 return
639
641 """Visited from a NamedNodeMap in DocumentType"""
642 self._tryIndent()
643 self._write('<!ENTITY %s' % (node.nodeName))
644 node.publicId and self._write(' PUBLIC %s' % node.publicId)
645 node.systemId and self._write(' SYSTEM %s' % node.systemId)
646 node.notationName and self._write(' NDATA %s' % node.notationName)
647 self._write('>')
648 return
649
651 """Visited from a NamedNodeMap in DocumentType"""
652 self._tryIndent()
653 self._write('<!NOTATION %s' % node.nodeName)
654 node.publicId and self._write(' PUBLIC %s' % node.publicId)
655 node.systemId and self._write(' SYSTEM %s' % node.systemId)
656 self._write('>')
657 return
658
660 self._tryIndent()
661 self._write('<![CDATA[%s]]>' % (node.data))
662 self._inText = 0
663 return
664
670
672 self._write('&%s;' % node.nodeName)
673 self._inText = 1
674 return
675
677 self._tryIndent()
678 self._write('<?%s %s?>' % (node.target, node.data))
679 self._inText = 0
680 return
681
def _encodeText(text, encoding):
   """
   Converts text into an encoded string using the named codec.

   Input that is not already a unicode object is first decoded as UTF-8.

   @param text: Text to encode.
   @param encoding: Name of the codec to encode with.

   @return: Encoded form of the text.

   @copyright: This code, prior to customization, was part of the PyXML
   codebase, and before that was part of the 4DOM suite developed by
   Fourthought, Inc.  In its original form, it was attributed to Martin v.
   Löwis and was Copyright (c) 2000 Fourthought Inc, USA; All Rights Reserved.
   """
   # Resolve the codec first so an unknown encoding fails before any decoding
   encode = codecs.getencoder(encoding)
   if type(text) is UnicodeType:
      decoded = text
   else:
      decoded = unicode(text, "utf-8")
   return encode(decoded)[0]
693
695 """
696 Handles normalization and some intelligence about quoting.
697
698 @copyright: This code, prior to customization, was part of the PyXML
699 codebase, and before that was part of the 4DOM suite developed by
700 Fourthought, Inc. In its original form, it was Copyright (c) 2000
701 Fourthought Inc, USA; All Rights Reserved.
702 """
703 if not characters:
704 return '', "'"
705 if "'" in characters:
706 delimiter = '"'
707 new_chars = re.sub('"', '"', characters)
708 else:
709 delimiter = "'"
710 new_chars = re.sub("'", ''', characters)
711
712
713
714 if "\n" in characters:
715 new_chars = re.sub('\n', ' ', new_chars)
716 return new_chars, delimiter
717
718
def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
   """
   Escapes character data for use as XML text and encodes the result.

   Unless C{markupSafe} is set, each C{&}, C{<} and C{]]>} in C{characters}
   is replaced with the matching entity reference, and a leading C{>} is
   escaped when C{prev_chars} ends with C{]]} (so the two chunks cannot
   combine into a literal C{]]>}).  Characters matched by the illegal
   character pattern are then replaced with decimal character references,
   and the result is passed through L{_encodeText}.

   @param characters: Text to translate.
   @param encoding: Name of the output encoding, passed to L{_encodeText}.
   @param prev_chars: Text written immediately before C{characters}, if any.
   @param markupSafe: If true, C{characters} is written without entity escaping.

   @return: Translated text as returned by L{_encodeText}, or an empty
            string if C{characters} is empty.

   @copyright: This code, prior to customization, was part of the PyXML
   codebase, and before that was part of the 4DOM suite developed by
   Fourthought, Inc.  In its original form, it was Copyright (c) 2000
   Fourthought Inc, USA; All Rights Reserved.
   """
   CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
   # Each markup-significant sequence maps to its escaped form; mapping a
   # sequence to itself would leave the output malformed.
   CHAR_TO_ENTITY = { '&': '&amp;', '<': '&lt;', ']]>': ']]&gt;', }
   ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
   ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
   XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s' % (ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
   if not characters:
      return ''
   if markupSafe:
      new_string = characters
   else:
      # sub() returns the input unchanged when nothing matches
      new_string = CDATA_CHAR_PATTERN.sub(lambda m: CHAR_TO_ENTITY[m.group()], characters)
      if prev_chars[-2:] == ']]' and characters[0] == '>':
         # A bare '>' is never rewritten above, so it is still the first character
         new_string = '&gt;' + new_string[1:]
   # NOTE(review): a match of the three-character ILLEGAL_HIGH_CHARS sequence
   # makes ord() raise TypeError; left unchanged because the correct
   # replacement for that case is not evident from this code.
   if XML_ILLEGAL_CHAR_PATTERN.search(new_string):
      new_string = XML_ILLEGAL_CHAR_PATTERN.subn(lambda m: '&#%i;' % ord(m.group()), new_string)[0]
   new_string = _encodeText(new_string, encoding)
   return new_string
749