Simple XML handling in Python script

Joined: 21 May 2007
Posts: 1

PostPosted: Mon May 21, 2007 10:10 pm    Post subject: Simple XML handling in Python script

    Simple, self-contained code for reading XML into a tree of nodes
    and using that tree to drive text in an OOWriter document.
    This is not a comprehensive XML solution, but it is good for basic XML needs.
    To use, copy module and add your code into final function.
    Jack Trainor    2007-05-21

import uno
import xml.sax
import xml.sax.handler
import xml.sax.saxutils
import StringIO

# UNO boolean wrappers, passed wherever the calls below take a boolean flag.
UnoTrue = uno.Bool(1)
UnoFalse = uno.Bool(0)

# The constant names had been lost (empty strings), which cannot resolve to
# any IDL constant.  ControlCharacter is an IDL constants group, so its
# members are looked up by their fully qualified name.
PARAGRAPH_BREAK  = uno.getConstantByName( "com.sun.star.text.ControlCharacter.PARAGRAPH_BREAK" )
# BreakType is an IDL enum rather than a constants group, so the value is
# built with uno.Enum( typeName, memberName ).
PAGE_BEFORE = uno.Enum( "com.sun.star.style.BreakType", "PAGE_BEFORE" )

""" Node is a node in an XML tree with its tag, text and attributes """
class Node:
    """One element of a parsed XML tree.

    Attributes:
        tag: element name.
        text: character data of the element, stored in cleaned form
            (see clean()).
        attributes: mapping of attribute names to values.
        parent: the owning Node, or None; usually set by addChild().
        nodes: list of child Nodes in the order they were added.
    """

    def __init__(self, tag, text="", attributes=None, parent=None):
        """Create a node.

        tag: element name.
        text: initial character data; it is cleaned before being stored.
        attributes: mapping of attribute names to values; None means no
            attributes.
        parent: owning Node, if already known.
        """
        self.tag = tag
        # Build a fresh mapping per node: a {} default argument would be
        # shared by every node created without explicit attributes.
        if attributes is None:
            attributes = {}
        self.attributes = attributes
        self.parent = parent  # usually set by Node#addChild
        self.nodes = []
        self.setText(text)

    def filterIllegalAscii(text):
        """Return text without control chars and chars > 127.

        Tab, carriage return and newline are kept.
        """
        # Note: discards controls chars and chars > 127 -- doesn't represent them
        return "".join([c for c in text if ((c >= " "  and c <= "~") or c in "\t\r\n")])
    filterIllegalAscii = staticmethod(filterIllegalAscii)

    def escape(text):
        """Return text with &, < and > replaced by their XML entities."""
        return xml.sax.saxutils.escape(text)
    escape = staticmethod(escape)

    def clean(text):
        """Return text filtered to plain ASCII and then XML-escaped."""
        text = Node.filterIllegalAscii(text)
        text = Node.escape(text)
        return text
    clean = staticmethod(clean)

    def cleanAttr(text):
        """Return text cleaned for use inside a double-quoted XML attribute."""
        # Clean first: replacing the quotes before escaping would turn the
        # "&" of the inserted entity into "&amp;".  The entity is "&quot;".
        text = Node.clean(text)
        text = text.replace('"', "&quot;")
        return text
    cleanAttr = staticmethod(cleanAttr)

    def setText(self, text):
        """Replace the node's text with the cleaned form of text."""
        self.text = ""
        if text:
            self.appendText(text)

    def appendText(self, text):
        """Append the cleaned form of text to the node's text."""
        self.text += Node.clean(text)

    def addChild(self, node):
        """Attach node as the last child of this node; None is ignored."""
        if node is not None:
            node.parent = self
            self.nodes.append(node)
""" NodeHandler interfaces with Python SAX to build tree of Nodes  """
class NodeHandler( xml.sax.handler.ContentHandler ):
    """SAX content handler that assembles the parsed elements into Nodes.

    Attributes:
        stack: the elements currently open, outermost first.
        root: the first element seen, or None before parsing starts.
    """

    def __init__( self ):
        xml.sax.handler.ContentHandler.__init__( self )
        self.stack = []
        self.root = None

    def getCurNode( self ):
        """Return the innermost open node, or None when no element is open."""
        if self.stack:
            return self.stack[ -1 ]
        return None

    def pushNode( self, node ):
        """Make node the innermost open node."""
        self.stack.append( node )

    def popNode( self ):
        """Close the innermost open node and return it."""
        node = self.getCurNode()
        assert node is not None
        if node is not None:
            self.stack.pop()
        return node

    def startElement( self, name, attributes=None ):
        """Create a Node for the element and link it under the open node."""
        # The parser may reuse the attributes object after this call, so
        # keep a plain dict copy instead of a reference to it.
        if attributes is None:
            attrs = {}
        else:
            attrs = dict( attributes.items() )
        node = Node( name, '', attrs )
        # Guarantee node.text exists before characters() appends to it.
        node.setText( '' )
        if self.root is None:
            self.root = node
        curNode = self.getCurNode()
        if curNode is not None:
            curNode.addChild( node )
        self.pushNode( node )

    def characters( self, data ):
        """Append a chunk of character data to the innermost open node."""
        node = self.getCurNode()
        assert node is not None
        # The parser may deliver one run of text in several chunks.
        node.appendText( data )

    def endElement( self, name="" ):
        """Close the innermost open node and trim its accumulated text."""
        node = self.popNode()
        node.text = node.text.strip()
        if name:
            assert node.tag == name

def parseSource( source ):
    """Parse source with a SAX parser and return the root of the Node tree.

    source is handed unchanged to the parser's parse() method.  The value
    returned is the handler's root attribute once parsing has finished.
    """
    treeBuilder = NodeHandler()
    reader = xml.sax.make_parser()
    reader.setContentHandler( treeBuilder )
    reader.parse( source )
    return treeBuilder.root

def parseString(s):
    """Parse the XML held in string s and return the root of the Node tree."""
    # Wrap the string in a file-like object before handing it to parseSource.
    stream = StringIO.StringIO(s)
    return parseSource(stream)

def parseFile(path):
    """Parse the XML file at path and return the root of the Node tree."""
    # The path goes to parseSource as-is.
    root = parseSource(path)
    return root

def PythonXmlSample( ):
    """Insert the items of an XML file at the end of the current document.

    Reads the XML file at a hard-coded path and, for each child of the root
    element, inserts its "title" attribute as a "Heading 1" paragraph and
    its text as a "Text body" paragraph, then sets a page break on the
    cursor.  Always returns None.
    """
    document = XSCRIPTCONTEXT.getDocument()
    text = document.Text
    cursor = text.createTextCursor()
    cursor.gotoEnd( UnoFalse )
    # Add your code here...
    # The code below reads a file with a hard-coded path for an XML file
    # containing titles and texts, and inserts them into current OO document.
    path = r"C:\Docs\Outlines\Book.xml"  # hard-coded path for convenience
    root = parseFile(path)  # parses xml file at path into one root node

    # XML format in this example (items are children of the root element):
    #     <item title="This Is the Item's Title">
    #     This is the item's text.
    #     </item>
    #     [ ... more items ... ]
    for node in root.nodes:
        # print title in Heading 1 style
        cursor.setPropertyValue("ParaStyleName", "Heading 1")
        text.insertString(cursor, node.attributes.get("title","..."), UnoFalse)
        text.insertControlCharacter(cursor, PARAGRAPH_BREAK, UnoFalse)

        # print text in Text body style followed by a page break
        cursor.setPropertyValue("ParaStyleName", "Text body")
        text.insertString(cursor, "\n\n", UnoFalse)
        text.insertString(cursor, node.text, UnoFalse)
        text.insertControlCharacter(cursor, PARAGRAPH_BREAK, UnoFalse)
        cursor.BreakType = PAGE_BEFORE
    return None
