| 13 April 2007 |
So far I have worked a lot with Java and XML, but mostly with the DOM parser. For my current project I had to split a big XML document into smaller 'messages' and put these small messages on a queue. If you have the same experience with DOM as I do, then you know that this is not going to work without a lot of available memory, because DOM loads the whole document into memory at once. That is why I chose a streaming parser instead: to be more specific, a StAX (pull) parser.
One thing that you have to take care of is the namespaces in the original 'batch' XML document. These namespaces can be declared on the root element (or on any other enclosing element), and when you just 'copy' some inner XML fragments out as new XML documents you will lose the namespace declarations made at a 'higher' level. For example, see this batch XML document:
-
<?xml version="1.0" encoding="UTF-8" ?>
-
<root-element xmlns:prefix="http://www.pascalalma.net/xml">
-
<xml-messages xmlns:prefix2="http://www.pascalalma.net/xml2">
-
<prefix:xml-message xml-attr1="msg one">
-
<prefix2:nr xml-attr2="one">1</prefix2:nr>
-
<prefix3:msg
-
xmlns:prefix3="http://www.pascalalma.net/xml3">
-
Message number one
-
</prefix3:msg>
-
</prefix:xml-message>
-
<prefix:xml-message xml-attr1="msg two">
-
<prefix2:nr xml-attr2="two">2</prefix2:nr>
-
<prefix3:msg
-
xmlns:prefix3="http://www.pascalalma.net/xml3">
-
Message number two
-
</prefix3:msg>
-
</prefix:xml-message>
-
<prefix:xml-message xml-attr1="msg three">
-
<prefix2:nr xml-attr2="three">3</prefix2:nr>
-
<prefix3:msg
-
xmlns:prefix3="http://www.pascalalma.net/xml3">
-
Message number three
-
</prefix3:msg>
-
</prefix:xml-message>
-
</xml-messages>
-
</root-element>
What we need is an XML document for each 'prefix:xml-message' tag we encounter. But simply copying and pasting that piece of XML is not going to work, because the copy would be missing the declaration of the 'prefix' namespace. To avoid that, I collect every namespace I encounter before I reach the 'prefix:xml-message' tag and write this collection of namespaces onto the root element of the new XML document. Here is the complete class doing the whole thing:
-
package net.pascalalma.xml;
-
-
import java.io.InputStream;
-
import java.io.StringWriter;
-
import java.util.HashMap;
-
import java.util.Iterator;
-
import java.util.Map;
-
-
import javax.xml.namespace.QName;
-
import javax.xml.stream.XMLInputFactory;
-
import javax.xml.stream.XMLOutputFactory;
-
import javax.xml.stream.XMLStreamConstants;
-
import javax.xml.stream.XMLStreamException;
-
import javax.xml.stream.XMLStreamReader;
-
import javax.xml.stream.XMLStreamWriter;
-
-
import org.codehaus.stax2.XMLInputFactory2;
-
import org.codehaus.stax2.XMLOutputFactory2;
-
-
-
public class StaxParser {
-
-
private static final QName MSG_TAG = new QName("http://www.pascalalma.net/xml", "xml-message");
-
-
protected static XMLInputFactory xmlInFactory;
-
protected static XMLOutputFactory xmlOutFactory;
-
-
static {
-
-
xmlInFactory = XMLInputFactory2.newInstance();
-
xmlOutFactory = XMLOutputFactory2.newInstance();
-
}
-
-
return xmlInFactory.createXMLStreamReader(xml, ENCODING);
-
-
}
-
-
-
long counter = -1;
-
XMLStreamReader xmlReader = getXMLStream(xml);
-
-
// Namespaces declared on the elements 'above' the <prefix:xml-message>
-
Map<string, String> globalNamespaces = new HashMap<string, String>();
-
-
try {
-
while (xmlReader.hasNext()) {
-
switch (xmlReader.getEventType()) {
-
case XMLStreamConstants.START_DOCUMENT:
-
break;
-
case XMLStreamConstants.START_ELEMENT:
-
-
if (MSG_TAG.equals(xmlReader.getName())) {
-
// Start of a 'small' xml-message.
-
handleMessage(xmlMsg);
-
counter++;
-
} else {
-
// Some high-level tag containing more tags detected.
-
// Get the namespaces at the highest level.
-
// After that, we need to add these namespaces at lower level,
-
// for every xml-message
-
int nrOfNamespaces = xmlReader.getNamespaceCount();
-
for (int i = 0; i <nrOfNamespaces; i++) {
-
globalNamespaces.put(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i));
-
}
-
}
-
break;
-
case XMLStreamConstants.END_DOCUMENT:
-
break;
-
default:
-
// LOG.warn("unimplemented event type: " + xmlReader.getEventType());
-
}
-
// move to next event
-
if (xmlReader.hasNext()) {
-
xmlReader.next();
-
}
-
}
-
} catch (XMLStreamException e) {
-
}
-
return counter;
-
-
}
-
-
-
-
/**
-
* Copies all xml after the start element of 'xml-message' until the end element of 'xml-message' .
-
* @param xmlReader the batch xml
-
* @param namespaces the namespaces defined at high level
-
* @return the xml-message
-
* @throws Exception if any exception occurs
-
*/
-
// Until we reach the end tag we write everything to a buffer that will
-
// contain the xml-message content.
-
XMLStreamWriter xmlMsgWriter = null;
-
-
try {
-
-
xmlMsgWriter = xmlOutFactory.createXMLStreamWriter(sw);
-
-
// Write first element with (global) namespaces
-
xmlMsgWriter.writeStartElement(xmlReader.getPrefix(), xmlReader.getLocalName(), xmlReader
-
.getNamespaceURI());
-
// Add all namespaces at global level (if any) to the new root element
-
while (keys.hasNext()) {
-
xmlMsgWriter.writeNamespace(nsPrefix, namespaces.get(nsPrefix));
-
}
-
// Get the attributes for the current element
-
copyAttributes(xmlReader, xmlMsgWriter);
-
-
-
// Now continue with the loop
-
xmlReader.next();
-
-
localLoop: while (xmlReader.hasNext()) {
-
-
switch (xmlReader.getEventType()) {
-
case XMLStreamConstants.ATTRIBUTE:
-
copyAttributes(xmlReader, xmlMsgWriter);
-
break;
-
case XMLStreamConstants.CDATA:
-
xmlMsgWriter.writeCData(xmlReader.getText());
-
break;
-
case XMLStreamConstants.CHARACTERS:
-
xmlMsgWriter.writeCharacters(xmlReader.getText());
-
break;
-
case XMLStreamConstants.COMMENT:
-
xmlMsgWriter.writeComment(xmlReader.getText());
-
break;
-
case XMLStreamConstants.DTD:
-
xmlMsgWriter.writeDTD(xmlReader.getText());
-
break;
-
case XMLStreamConstants.NAMESPACE:
-
copyNamespaces(xmlReader, xmlMsgWriter);
-
break;
-
case XMLStreamConstants.START_ELEMENT:
-
// Copy the start element
-
xmlMsgWriter.writeStartElement(xmlReader.getPrefix(), xmlReader.getLocalName(), xmlReader
-
.getNamespaceURI());
-
// Add all attributes for this element
-
copyAttributes(xmlReader, xmlMsgWriter);
-
// Add all namespaces for this element
-
copyNamespaces(xmlReader, xmlMsgWriter);
-
break;
-
case XMLStreamConstants.END_ELEMENT:
-
xmlMsgWriter.writeEndElement();
-
if (MSG_TAG.equals(xmlReader.getName())) {
-
break localLoop;
-
}
-
case XMLStreamConstants.SPACE:
-
// ignore spaces
-
break;
-
default:
-
}
-
// move to next event
-
if (xmlReader.hasNext()) {
-
xmlReader.next();
-
}
-
-
} // end localLoop
-
-
xmlMsgWriter.flush();
-
} catch (XMLStreamException e) {
-
} finally {
-
-
if (xmlMsgWriter != null) {
-
xmlMsgWriter.close();
-
}
-
if (sw != null) {
-
sw.close();
-
}
-
-
}
-
return sw.toString();
-
}
-
-
/**
-
* Copies all attributes found at the current position in the xmlReader to the current
-
* position in the xmlWriter.
-
* @param xmlReader XMLStreamReader
-
* @param xmlWriter XMLStreamWriter
-
* @throws XMLStreamException if any xmlStream related exception occurs
-
*/
-
private void copyAttributes(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException {
-
int numOfAttr = xmlReader.getAttributeCount();
-
for (int i = 0; i <numOfAttr; i++) {
-
xmlWriter.writeAttribute(xmlReader.getAttributePrefix(i), xmlReader.getAttributeNamespace(i),
-
xmlReader.getAttributeLocalName(i), xmlReader.getAttributeValue(i));
-
}
-
}
-
/**
-
* Copies all namespaces found at the current position in the xmlReader to the current
-
* position in the xmlWriter.
-
* @param xmlReader XMLStreamReader
-
* @param xmlWriter XMLStreamWriter
-
* @throws XMLStreamException if any xmlStream related exception occurs
-
*/
-
private void copyNamespaces(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException {
-
int nrOfNamespaces = xmlReader.getNamespaceCount();
-
for (int i = 0; i <nrOfNamespaces; i++) {
-
-
xmlWriter.writeNamespace(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i));
-
}
-
}
-
-
}
-
}
To run this class you will need the following libraries on the classpath: stax-api-1.0.1.jar and wstx-asl-3.1.1.jar.
Have fun with it!

