Overview
I was working on a Java application that uploads an Excel sheet to a web service. The Excel sheet is sent as a byte stream, and a second web service returns its contents so that they can be shown in the web GUI. Recently I ran into an issue: the upload succeeded, and the web service was able to parse the file and store it in the database, but when retrieving the contents as a Java object I sometimes got the exception below.
org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x1a) was found in the element content of the document at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(Unknown Source) at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(Unknown Source) at javax.xml.parsers.DocumentBuilder.parse(Unknown Source)
From the error, it looked like an encoding issue. A few days later I hit the same problem with text submitted through a text box, into which users can paste text copied from a Word file. To find a general solution for both issues, I started looking into web service handlers.
Webservice Handler to remove Invalid XML characters
Create a Webservice Handler
Here is what the oracle site says about webservice handlers
After googling for a while, I came up with a handler that filters invalid XML characters (written as numeric character references such as &#x1a;) out of the SOAP message.
package com.tak.soap.filter;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Set;

import javax.xml.namespace.QName;
import javax.xml.soap.MessageFactory;
import javax.xml.soap.SOAPBody;
import javax.xml.soap.SOAPEnvelope;
import javax.xml.soap.SOAPException;
import javax.xml.soap.SOAPMessage;
import javax.xml.ws.handler.MessageContext;
import javax.xml.ws.handler.soap.SOAPHandler;
import javax.xml.ws.handler.soap.SOAPMessageContext;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * SOAP handler that serializes the current message, strips invalid numeric
 * character references from it with {@link InvalidXmlCharacterFilter}, and
 * replaces the message in the context with one rebuilt from the filtered text.
 *
 * <p>The handler is best-effort: any failure is logged and the original
 * message is left in place so that the request is never blocked by the filter.
 */
public class InvalidXmlCharacterHandler implements SOAPHandler<SOAPMessageContext> {

    private static final Logger LOGGER = LogManager
            .getLogger(InvalidXmlCharacterHandler.class);

    /** Charset name used when the message does not declare an encoding. */
    public static final String UTF_8 = "UTF-8";

    /**
     * Filters the message held by {@code context} and swaps in the cleaned copy.
     *
     * @param context the message context supplied by the JAX-WS runtime
     * @return always {@code true}, so processing continues even if filtering failed
     */
    @Override
    public boolean handleMessage(SOAPMessageContext context) {
        SOAPMessage soapMsg = context.getMessage();
        if (soapMsg == null) {
            return true;
        }
        try {
            String rawXml = getXmlMessage(soapMsg);
            String methodName = getMethodName(context);
            // Log the text as received; the filtered text is logged separately below.
            LOGGER.debug("input message:" + rawXml);

            String filteredString = new InvalidXmlCharacterFilter()
                    .filterInvalidReferences(rawXml, methodName).trim();

            MessageFactory mf = MessageFactory.newInstance();
            InputStream is = new ByteArrayInputStream(
                    filteredString.getBytes(Charset.forName(UTF_8)));
            SOAPMessage msg = mf.createMessage(soapMsg.getMimeHeaders(), is);
            LOGGER.debug("handled message:" + filteredString);
            msg.saveChanges();
            context.setMessage(msg);
        } catch (Exception ex) {
            // catch all the exceptions, let the request pass,
            LOGGER.error("Error in handler: ", ex);
        }
        return true;
    }

    /**
     * Returns the local name of the first child of the SOAP body that has one.
     * The name is only used to label log output.
     *
     * <p>Children without a local name (for example whitespace text between
     * the body tag and the operation element) are skipped.
     *
     * @param context the message context whose body is inspected
     * @return the local name, or an empty string when none can be determined
     */
    private String getMethodName(SOAPMessageContext context) {
        try {
            SOAPEnvelope envelope = context.getMessage().getSOAPPart().getEnvelope();
            SOAPBody body = envelope.getBody();
            int childCount = body.getChildNodes().getLength();
            for (int i = 0; i < childCount; i++) {
                String localName = body.getChildNodes().item(i).getLocalName();
                if (localName != null) {
                    return localName;
                }
            }
        } catch (Exception ex) {
            LOGGER.warn("could not get the method name: " + ex);
        }
        return "";
    }

    /**
     * Serializes {@code message} and decodes the bytes into a string.
     *
     * <p>The charset is taken from the message's
     * {@link SOAPMessage#CHARACTER_SET_ENCODING} property; {@value #UTF_8} is
     * used when the property is not set.
     *
     * @param message the SOAP message to serialize
     * @return the serialized message text
     * @throws SOAPException if the message cannot be written
     * @throws IOException if writing fails or the charset name is not supported
     */
    public static String getXmlMessage(SOAPMessage message)
            throws SOAPException, IOException {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        message.writeTo(os);
        final String encoding = (String) message
                .getProperty(SOAPMessage.CHARACTER_SET_ENCODING);
        return new String(os.toByteArray(), encoding == null ? UTF_8 : encoding);
    }

    /** Faults are passed through untouched. */
    @Override
    public boolean handleFault(SOAPMessageContext context) {
        return true;
    }

    /** Nothing to release. */
    @Override
    public void close(MessageContext context) {
    }

    /** This handler does not process any SOAP headers. */
    @Override
    public Set<QName> getHeaders() {
        return Collections.emptySet();
    }
}
Remove Invalid XML characters
The handler above relies on InvalidXmlCharacterFilter. You can also use this class on its own to remove invalid XML character references from an XML string.
import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; public class InvalidXmlCharacterFilter { private static final Logger LOGGER = LogManager.getLogger(InvalidXmlCharacterFilter.class); private final static Pattern referencesFilterPattern = Pattern .compile("&#(([0-9]+)|([xX]([0-9A-Fa-f]+)));"); private final static String replacementValue = ""; /** * Checks if a number refers to a valid XML characters. This has been * derived by experimenting with all the values in range 0 to 65533. Numbers * not falling in this range are not valid. Refer * http://www.w3.org/TR/html4/sgml/entities.html * * @param ref * @return */ static boolean isInvalidReference(int ref) { if (ref <= 8 || (ref >= 11 && ref <= 12) || (ref >= 14 && ref <= 31) || (ref >= 55296 && ref <= 57343) || ref >= 65534) { return true; } return false; } /** * Filter out all invalid references from the message payload * * @param messagePayload * @return resulting messagePayload after filtering out all invalid * references */ public String filterInvalidReferences(String messagePayload, String methodName) { if (null == referencesFilterPattern) { return messagePayload; } if (messagePayload == null) { return messagePayload; } StringBuffer afterFilter = new StringBuffer(); Matcher matcher = referencesFilterPattern.matcher(messagePayload); while (matcher.find()) { String decimal = matcher.group(2); String hexaDecimal = matcher.group(4); Integer ref = null; try { if (null != decimal) { ref = Integer.parseInt(decimal); } else if (null != hexaDecimal) { ref = Integer.parseInt(hexaDecimal, 16); } else { // This will never happen LOGGER.warn(methodName +": Matcher found an unexpected value [ " + matcher.group() + " ] in the matched reference. The value is neither decimal nor hexadecimal. 
"); continue; } } catch (Exception e) { LOGGER.warn(methodName + ": " + matcher.group(1) + " of " + matcher.group() + " is neither a valid decimal or hexadecimal number! ", e); continue; } if (isInvalidReference(ref)) { LOGGER.info("replacing invalid reference " + matcher.group() + " from WS "+ methodName + "."); matcher.appendReplacement(afterFilter, replacementValue); } } matcher.appendTail(afterFilter); return afterFilter.toString(); } }
Handler Declaration
And here is the handler declaration. Here you can see more
<?xml version="1.0" encoding="UTF-8"?> <!-- Endpoint descriptor: declares the 'testinvalidcharacters' endpoint implemented by com.tak.ws.TestWebservice and attaches com.tak.soap.filter.InvalidXmlCharacterHandler to its handler chain. --> <endpoints xmlns='http://java.sun.com/xml/ns/jax-ws/ri/runtime' version='2.0'> <endpoint name='testinvalidcharacters' implementation='com.tak.ws.TestWebservice' url-pattern='/testinvalidcharacters' > <handler-chains xmlns="http://java.sun.com/xml/ns/javaee"> <handler-chain> <handler> <handler-class>com.tak.soap.filter.InvalidXmlCharacterHandler</handler-class> </handler> </handler-chain> </handler-chains> </endpoint> </endpoints>
Hi, I’m getting the same exception when I’m calling the getMessage method:
context.getMessage()
When I call the getMessage method I get the following error:
javax.xml.ws.WebServiceException: javax.xml.soap.SOAPException: org.xml.sax.SAXParseException; lineNumber: 1125; columnNumber: 122; Illegal character entity: expansion character (code 0x15
at [row,col {unknown-source}]: [1125,122]