--- old/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java 2018-09-17 11:38:50.849709841 -0700 +++ new/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java 2018-09-17 11:38:50.446672090 -0700 @@ -51,7 +51,7 @@ * serializers (xml, html, text ...) that write output to a stream. * * @xsl.usage internal - * @LastModified: Feb 2018 + * @LastModified: Sept 2018 */ abstract public class ToStream extends SerializerBase { @@ -193,6 +193,8 @@ */ private boolean m_expandDTDEntities = true; + private char m_highSurrogate = 0; + /** * Default constructor */ @@ -953,45 +955,46 @@ * @param ch Character array. * @param i position Where the surrogate was detected. * @param end The end index of the significant characters. - * @return 0 if the pair of characters was written out as-is, - * the unicode code point of the character represented by - * the surrogate pair if an entity reference with that value - * was written out. + * @return the status of writing a surrogate pair. + * -1 -- nothing is written + * 0 -- the pair is written as-is + * code point -- the pair is written as an entity reference * * @throws IOException * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. */ protected int writeUTF16Surrogate(char c, char ch[], int i, int end) - throws IOException + throws IOException, SAXException { - int codePoint = 0; + int status = -1; if (i + 1 >= end) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { Integer.toHexString((int) c)})); + m_highSurrogate = c; + return status; + } + + char high, low; + if (m_highSurrogate == 0) { + high = c; + low = ch[i+1]; + status = 0; + } else { + high = m_highSurrogate; + low = c; + m_highSurrogate = 0; } - final char high = c; - final char low = ch[i+1]; if (!Encodings.isLowUTF16Surrogate(low)) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { - Integer.toHexString((int) c) - + " " - + Integer.toHexString(low)})); + throwIOE(high, low); } final Writer writer = m_writer; // If we make it to here we have a valid high, low surrogate pair - if (m_encodingInfo.isInEncoding(c,low)) { + if (m_encodingInfo.isInEncoding(high,low)) { // If the character formed by the surrogate pair // is in the encoding, so just write it out - writer.write(ch,i,2); + writer.write(new char[]{high, low}, 0, 2); } else { // Don't know what to do with this char, it is @@ -999,24 +1002,16 @@ // a surrogate pair, so write out as an entity ref final String encoding = getEncoding(); if (encoding != null) { - /* The output encoding is known, - * so somthing is wrong. - */ - codePoint = Encodings.toCodePoint(high, low); - // not in the encoding, so write out a character reference - writer.write('&'); - writer.write('#'); - writer.write(Integer.toString(codePoint)); - writer.write(';'); + status = writeCharRef(writer, high, low); } else { /* The output encoding is not known, * so just write it out as-is. */ - writer.write(ch, i, 2); + writer.write(new char[]{high, low}, 0, 2); } } // non-zero only if character reference was written out. - return codePoint; + return status; } /** @@ -1106,32 +1101,7 @@ } else if (isCData && (!escapingNotNeeded(c))) { - // if (i != 0) - if (m_cdataTagOpen) - closeCDATA(); - - // This needs to go into a function... - if (Encodings.isHighUTF16Surrogate(c)) - { - writeUTF16Surrogate(c, ch, i, end); - i++ ; // process two input characters - } - else - { - writer.write("&#"); - - String intStr = Integer.toString((int) c); - - writer.write(intStr); - writer.write(';'); - } - - // if ((i != 0) && (i < (end - 1))) - // if (!m_cdataTagOpen && (i < (end - 1))) - // { - // writer.write(CDATA_DELIMITER_OPEN); - // m_cdataTagOpen = true; - // } + i = handleEscaping(writer, c, ch, i, end); } else if ( isCData @@ -1155,29 +1125,44 @@ } writer.write(c); } - - // This needs to go into a function... - else if (Encodings.isHighUTF16Surrogate(c)) - { - if (m_cdataTagOpen) - closeCDATA(); - writeUTF16Surrogate(c, ch, i, end); - i++; // process two input characters + else { + i = handleEscaping(writer, c, ch, i, end); } - else - { - if (m_cdataTagOpen) - closeCDATA(); - writer.write("&#"); + } + } - String intStr = Integer.toString((int) c); + } - writer.write(intStr); - writer.write(';'); + /** + * Handles escaping, writes either with a surrogate pair or a character + * reference. + * + * @param c the current char + * @param ch the character array + * @param i the current position + * @param end the end index of the array + * @return the next index + * + * @throws IOException + * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. + */ + private int handleEscaping(Writer writer, char c, char ch[], int i, int end) + throws IOException, SAXException { + if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c)) + { + if (writeUTF16Surrogate(c, ch, i, end) >= 0) { + // move the index if the low surrogate is consumed + // as writeUTF16Surrogate has written the pair + if (Encodings.isHighUTF16Surrogate(c)) { + i++ ; } } } - + else + { + writeCharRef(writer, c); + } + return i; } /** @@ -1246,7 +1231,7 @@ m_elemContext.m_startTagOpen = false; } - if (shouldIndent()) + if (!m_cdataTagOpen && shouldIndent()) indent(); boolean writeCDataBrackets = @@ -1644,7 +1629,7 @@ int i, char ch, int lastDirty, - boolean fromTextNode) throws IOException + boolean fromTextNode) throws IOException, SAXException { int startClean = lastDirty + 1; // if we have some clean characters accumulated @@ -1723,54 +1708,40 @@ int len, boolean fromTextNode, boolean escLF) - throws IOException + throws IOException, SAXException { int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF); if (i == pos) { + if (m_highSurrogate != 0) { + if (!(Encodings.isLowUTF16Surrogate(ch))) { + throwIOE(m_highSurrogate, ch); + } + writeCharRef(writer, m_highSurrogate, ch); + m_highSurrogate = 0; + return ++pos; + } + if (Encodings.isHighUTF16Surrogate(ch)) { - - // Should be the UTF-16 low surrogate of the hig/low pair. - char next; - // Unicode code point formed from the high/low pair. - int codePoint = 0; - if (i + 1 >= len) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { Integer.toHexString(ch)})); - //"Invalid UTF-16 surrogate detected: " - - //+Integer.toHexString(ch)+ " ?"); + // save for the next read + m_highSurrogate = ch; + pos++; } else { - next = chars[++i]; - + // the next should be the UTF-16 low surrogate of the hig/low pair. + char next = chars[++i]; if (!(Encodings.isLowUTF16Surrogate(next))) - throw new IOException( - Utils.messages.createMessage( - MsgKey - .ER_INVALID_UTF16_SURROGATE, - new Object[] { - Integer.toHexString(ch) - + " " - + Integer.toHexString(next)})); - //"Invalid UTF-16 surrogate detected: " - - //+Integer.toHexString(ch)+" "+Integer.toHexString(next)); - codePoint = Encodings.toCodePoint(ch,next); - } - - writer.write("&#"); - writer.write(Integer.toString(codePoint)); - writer.write(';'); - pos += 2; // count the two characters that went into writing out this entity + throwIOE(ch, next); + + writeCharRef(writer, ch, next); + pos += 2; // count the two characters that went into writing out this entity + } } else { @@ -1782,18 +1753,14 @@ if (isCharacterInC0orC1Range(ch) || (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch))) { - writer.write("&#"); - writer.write(Integer.toString(ch)); - writer.write(';'); + writeCharRef(writer, ch); } else if ((!escapingNotNeeded(ch) || ( (fromTextNode && m_charInfo.isSpecialTextChar(ch)) || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))) - && m_elemContext.m_currentElemDepth > 0) + && m_elemContext.m_currentElemDepth > 0) { - writer.write("&#"); - writer.write(Integer.toString(ch)); - writer.write(';'); + writeCharRef(writer, ch); } else { @@ -1807,6 +1774,45 @@ } /** + * Writes out a character reference. + * @param writer the writer + * @param c the character + * @throws IOException + */ + private void writeCharRef(Writer writer, char c) throws IOException, SAXException { + if (m_cdataTagOpen) + closeCDATA(); + writer.write("&#"); + writer.write(Integer.toString(c)); + writer.write(';'); + } + + /** + * Writes out a pair of surrogates as a character reference + * @param writer the writer + * @param high the high surrogate + * @param low the low surrogate + * @throws IOException + */ + private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException { + if (m_cdataTagOpen) + closeCDATA(); + // Unicode code point formed from the high/low pair. + int codePoint = Encodings.toCodePoint(high, low); + writer.write("&#"); + writer.write(Integer.toString(codePoint)); + writer.write(';'); + return codePoint; + } + + private void throwIOE(char ch, char next) throws IOException { + throw new IOException(Utils.messages.createMessage( + MsgKey.ER_INVALID_UTF16_SURROGATE, + new Object[] {Integer.toHexString(ch) + " " + + Integer.toHexString(next)})); + } + + /** * Receive notification of the beginning of an element, although this is a * SAX method additional namespace or attribute information can occur before * or after this call, that is associated with this element. @@ -2053,7 +2059,7 @@ Writer writer, String string, String encoding) - throws IOException + throws IOException, SAXException { final int len = string.length(); if (len > m_attrBuff.length)