So far I have worked a lot with Java and XML, but mostly with the DOM parser. For my current project I had to split a big XML document into smaller ‘messages’ and put these small messages on a queue. If you have the same experience with DOM as I do, you know that this is not going to work without a lot of available memory, because DOM builds the whole document tree in memory. That’s why I chose a streaming parser instead, and to be more specific a StAX parser (a pull-based streaming API, as opposed to the push-based SAX).
One thing that you have to take care of is the namespaces in the original ‘batch’ XML document. These namespaces can be declared on the root element, and when you just ‘copy’ some inner XML fragments into new XML documents you might miss the namespaces declared at a ‘higher’ level. For example, see this batch XML document:
<?xml version="1.0" encoding="UTF-8" ?>
<root-element xmlns:prefix="http://www.redstream.nl/xml">
  <xml-messages xmlns:prefix2="http://www.redstream.nl/xml2">
    <prefix:xml-message xml-attr1="msg one">
      <prefix2:nr xml-attr2="one">1</prefix2:nr>
      <prefix3:msg xmlns:prefix3="http://www.redstream.nl/xml3">
        Message number one
      </prefix3:msg>
    </prefix:xml-message>
    <prefix:xml-message xml-attr1="msg two">
      <prefix2:nr xml-attr2="two">2</prefix2:nr>
      <prefix3:msg xmlns:prefix3="http://www.redstream.nl/xml3">
        Message number two
      </prefix3:msg>
    </prefix:xml-message>
    <prefix:xml-message xml-attr1="msg three">
      <prefix2:nr xml-attr2="three">3</prefix2:nr>
      <prefix3:msg xmlns:prefix3="http://www.redstream.nl/xml3">
        Message number three
      </prefix3:msg>
    </prefix:xml-message>
  </xml-messages>
</root-element>
What we need is an XML document for each ‘prefix:xml-message’ tag we encounter. But simply copying and pasting that piece of XML is not going to work, because we would then miss the declaration of the ‘prefix’ namespace (and of ‘prefix2’, which is declared on ‘xml-messages’). To avoid that, I collect every namespace I encounter before I reach the ‘prefix:xml-message’ tag and write this collection of namespaces onto the root element of the new XML document. Here is the complete class doing the whole thing:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | package net.pascalalma.xml; import java.io.InputStream; import java.io.StringWriter; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import javax.xml.namespace.QName; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import javax.xml.stream.XMLStreamWriter; import org.codehaus.stax2.XMLInputFactory2; import org.codehaus.stax2.XMLOutputFactory2; public class StaxParser { private static final QName MSG_TAG = new QName("http://www.redstream.nl/xml", "xml-message"); private static final String ENCODING = "UTF-8"; protected static XMLInputFactory xmlInFactory; protected static XMLOutputFactory xmlOutFactory; static { System.setProperty("javax.xml.stream.XMLInputFactory", "com.ctc.wstx.stax.WstxInputFactory"); System.setProperty("javax.xml.stream.XMLOutputFactory", "com.ctc.wstx.stax.WstxOutputFactory"); System.setProperty("javax.xml.stream.XMLEventFactory", "com.ctc.wstx.stax.WstxEventFactory"); xmlInFactory = XMLInputFactory2.newInstance(); xmlOutFactory = XMLOutputFactory2.newInstance(); } 
private XMLStreamReader getXMLStream(InputStream xml) throws XMLStreamException { return xmlInFactory.createXMLStreamReader(xml, ENCODING); } public long parseBatchDocument(InputStream xml) throws Exception { long counter = -1; XMLStreamReader xmlReader = getXMLStream(xml); // Namespaces at the elements 'above' the <my-ns:xml-message> Map<string, String> globalNamespaces = new HashMap<string, String>(); try { while (xmlReader.hasNext()) { switch (xmlReader.getEventType()) { case XMLStreamConstants.START_DOCUMENT: System.out.println("Document started"); break; case XMLStreamConstants.START_ELEMENT: if (MSG_TAG.equals(xmlReader.getName())) { // Start of a 'small' xml-message. String xmlMsg = extractMessage(xmlReader, globalNamespaces); handleMessage(xmlMsg); counter++; } else { // Some high-level tag containing more tags detected. // Get the namespaces at the highest level. // After that, we need to add these namespaces at lower level, // for every xml-message int nrOfNamespaces = xmlReader.getNamespaceCount(); for (int i = 0; i < nrOfNamespaces; i++) { globalNamespaces.put(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i)); } } break; case XMLStreamConstants.END_DOCUMENT: System.out.println("Document ended"); break; default: // LOG.warn("unimplemented event type: " + xmlReader.getEventType()); } // move to next event if (xmlReader.hasNext()) { xmlReader.next(); } } } catch (XMLStreamException e) { throw new Exception(e); } return counter; } /** * Copies all xml after the start element of 'xml-message' until the end element of 'xml-message' . * @param xmlReader the batch xml * @param namespaces the namespaces defined at high level * @return the xml-message * @throws Exception if any exception occurs */ private String extractMessage(XMLStreamReader xmlReader, Map<string, String> namespaces) throws Exception { // Untill we reach the end tag we write everything to a buffer that will // contain the xml-message content. 
StringWriter sw = new StringWriter(); XMLStreamWriter xmlMsgWriter = null; try { xmlMsgWriter = xmlOutFactory.createXMLStreamWriter(sw); // Write first element with (global) namespaces xmlMsgWriter.writeStartElement(xmlReader.getPrefix(), xmlReader.getLocalName(), xmlReader .getNamespaceURI()); // Add all namespaces at global level (if any) to the new root element Iterator keys = namespaces.keySet().iterator(); while (keys.hasNext()) { String nsPrefix = (String) keys.next(); xmlMsgWriter.writeNamespace(nsPrefix, namespaces.get(nsPrefix)); } // Get the attributes for the current element copyAttributes(xmlReader, xmlMsgWriter); // Now contiue with the loop xmlReader.next(); localLoop: while (xmlReader.hasNext()) { switch (xmlReader.getEventType()) { case XMLStreamConstants.ATTRIBUTE: copyAttributes(xmlReader, xmlMsgWriter); break; case XMLStreamConstants.CDATA: xmlMsgWriter.writeCData(xmlReader.getText()); break; case XMLStreamConstants.CHARACTERS: xmlMsgWriter.writeCharacters(xmlReader.getText()); break; case XMLStreamConstants.COMMENT: xmlMsgWriter.writeComment(xmlReader.getText()); break; case XMLStreamConstants.DTD: xmlMsgWriter.writeDTD(xmlReader.getText()); break; case XMLStreamConstants.NAMESPACE: copyNamespaces(xmlReader, xmlMsgWriter); break; case XMLStreamConstants.START_ELEMENT: // Copy the start element xmlMsgWriter.writeStartElement(xmlReader.getPrefix(), xmlReader.getLocalName(), xmlReader .getNamespaceURI()); // Add all attributes for this element copyAttributes(xmlReader, xmlMsgWriter); // Add all namespaces for this element copyNamespaces(xmlReader, xmlMsgWriter); break; case XMLStreamConstants.END_ELEMENT: xmlMsgWriter.writeEndElement(); if (MSG_TAG.equals(xmlReader.getName())) { break localLoop; } case XMLStreamConstants.SPACE: // ignore spaces break; default: System.out.println("Unknown eventType = " + xmlReader.getEventType()); } // move to next event if (xmlReader.hasNext()) { xmlReader.next(); } } // end localLoop xmlMsgWriter.flush(); } catch 
(XMLStreamException e) { throw new Exception(e); } finally { if (xmlMsgWriter != null) { xmlMsgWriter.close(); } if (sw != null) { sw.close(); } } return sw.toString(); } /** * Copies all attributes found at the current position in the xmlReader to the current * position in the xmlWriter. * @param xmlReader XMLStreamReader * @param xmlWriter XMLStreamWriter * @throws XMLStreamException if any xmlStream related exception occurs */ private void copyAttributes(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException { int numOfAttr = xmlReader.getAttributeCount(); for (int i = 0; i < numOfAttr; i++) { xmlWriter.writeAttribute(xmlReader.getAttributePrefix(i), xmlReader.getAttributeNamespace(i), xmlReader.getAttributeLocalName(i), xmlReader.getAttributeValue(i)); } } /** * Copies all namespaces found at the current position in the xmlReader to the current * position in the xmlWriter. * @param xmlReader XMLStreamReader * @param xmlWriter XMLStreamWriter * @throws XMLStreamException if any xmlStream related exception occurs */ private void copyNamespaces(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException { int nrOfNamespaces = xmlReader.getNamespaceCount(); for (int i = 0; i < nrOfNamespaces; i++) { xmlWriter.writeNamespace(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i)); } } private void handleMessage(String xml) { System.out.println("===================================================="); System.out.println(xml); System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++"); } } |
To have this class running you will need the following libraries: stax-api-1.0.1.jar and wstx-asl-3.1.1.jar.
Have fun with it!