--- old/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java 2018-09-17 11:38:49.941624784 -0700 +++ new/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java 2018-09-17 11:38:49.551588251 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved. */ /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -40,6 +40,7 @@ * because it is used from another package. * * @xsl.usage internal + * @LastModified: Sept 2018 */ public final class ToHTMLStream extends ToStream { @@ -1049,7 +1050,7 @@ String name, String value, ElemDesc elemDesc) - throws IOException + throws IOException, SAXException { writer.write(' '); @@ -1373,7 +1374,7 @@ */ public void writeAttrString( final java.io.Writer writer, String string, String encoding) - throws IOException + throws IOException, SAXException { final int end = string.length(); if (end > m_attrBuff.length) @@ -1425,13 +1426,16 @@ } else { - if (Encodings.isHighUTF16Surrogate(ch)) + if (Encodings.isHighUTF16Surrogate(ch) || + Encodings.isLowUTF16Surrogate(ch)) { - - writeUTF16Surrogate(ch, chars, i, end); - i++; // two input characters processed - // this increments by one and the for() - // loop itself increments by another one. + if (writeUTF16Surrogate(ch, chars, i, end) >= 0) { + // move the index if the low surrogate is consumed + // as writeUTF16Surrogate has written the pair + if (Encodings.isHighUTF16Surrogate(ch)) { + i++; + } + } } // The next is kind of a hack to keep from escaping in the case --- old/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java 2018-09-17 11:38:50.849709841 -0700 +++ new/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java 2018-09-17 11:38:50.446672090 -0700 @@ -51,7 +51,7 @@ * serializers (xml, html, text ...) that write output to a stream. * * @xsl.usage internal - * @LastModified: Feb 2018 + * @LastModified: Sept 2018 */ abstract public class ToStream extends SerializerBase { @@ -193,6 +193,8 @@ */ private boolean m_expandDTDEntities = true; + private char m_highSurrogate = 0; + /** * Default constructor */ @@ -953,45 +955,46 @@ * @param ch Character array. * @param i position Where the surrogate was detected. * @param end The end index of the significant characters. - * @return 0 if the pair of characters was written out as-is, - * the unicode code point of the character represented by - * the surrogate pair if an entity reference with that value - * was written out. + * @return the status of writing a surrogate pair. + * -1 -- nothing is written + * 0 -- the pair is written as-is + * code point -- the pair is written as an entity reference * * @throws IOException * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. */ protected int writeUTF16Surrogate(char c, char ch[], int i, int end) - throws IOException + throws IOException, SAXException { - int codePoint = 0; + int status = -1; if (i + 1 >= end) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { Integer.toHexString((int) c)})); + m_highSurrogate = c; + return status; + } + + char high, low; + if (m_highSurrogate == 0) { + high = c; + low = ch[i+1]; + status = 0; + } else { + high = m_highSurrogate; + low = c; + m_highSurrogate = 0; } - final char high = c; - final char low = ch[i+1]; if (!Encodings.isLowUTF16Surrogate(low)) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { - Integer.toHexString((int) c) - + " " - + Integer.toHexString(low)})); + throwIOE(high, low); } final Writer writer = m_writer; // If we make it to here we have a valid high, low surrogate pair - if (m_encodingInfo.isInEncoding(c,low)) { + if (m_encodingInfo.isInEncoding(high,low)) { // If the character formed by the surrogate pair // is in the encoding, so just write it out - writer.write(ch,i,2); + writer.write(new char[]{high, low}, 0, 2); } else { // Don't know what to do with this char, it is @@ -999,24 +1002,16 @@ // a surrogate pair, so write out as an entity ref final String encoding = getEncoding(); if (encoding != null) { - /* The output encoding is known, - * so somthing is wrong. - */ - codePoint = Encodings.toCodePoint(high, low); - // not in the encoding, so write out a character reference - writer.write('&'); - writer.write('#'); - writer.write(Integer.toString(codePoint)); - writer.write(';'); + status = writeCharRef(writer, high, low); } else { /* The output encoding is not known, * so just write it out as-is. */ - writer.write(ch, i, 2); + writer.write(new char[]{high, low}, 0, 2); } } // non-zero only if character reference was written out. - return codePoint; + return status; } /** @@ -1106,32 +1101,7 @@ } else if (isCData && (!escapingNotNeeded(c))) { - // if (i != 0) - if (m_cdataTagOpen) - closeCDATA(); - - // This needs to go into a function... - if (Encodings.isHighUTF16Surrogate(c)) - { - writeUTF16Surrogate(c, ch, i, end); - i++ ; // process two input characters - } - else - { - writer.write("&#"); - - String intStr = Integer.toString((int) c); - - writer.write(intStr); - writer.write(';'); - } - - // if ((i != 0) && (i < (end - 1))) - // if (!m_cdataTagOpen && (i < (end - 1))) - // { - // writer.write(CDATA_DELIMITER_OPEN); - // m_cdataTagOpen = true; - // } + i = handleEscaping(writer, c, ch, i, end); } else if ( isCData @@ -1155,29 +1125,44 @@ } writer.write(c); } - - // This needs to go into a function... - else if (Encodings.isHighUTF16Surrogate(c)) - { - if (m_cdataTagOpen) - closeCDATA(); - writeUTF16Surrogate(c, ch, i, end); - i++; // process two input characters + else { + i = handleEscaping(writer, c, ch, i, end); } - else - { - if (m_cdataTagOpen) - closeCDATA(); - writer.write("&#"); + } + } - String intStr = Integer.toString((int) c); + } - writer.write(intStr); - writer.write(';'); + /** + * Handles escaping, writes either with a surrogate pair or a character + * reference. + * + * @param c the current char + * @param ch the character array + * @param i the current position + * @param end the end index of the array + * @return the next index + * + * @throws IOException + * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. + */ + private int handleEscaping(Writer writer, char c, char ch[], int i, int end) + throws IOException, SAXException { + if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c)) + { + if (writeUTF16Surrogate(c, ch, i, end) >= 0) { + // move the index if the low surrogate is consumed + // as writeUTF16Surrogate has written the pair + if (Encodings.isHighUTF16Surrogate(c)) { + i++ ; } } } - + else + { + writeCharRef(writer, c); + } + return i; } /** @@ -1246,7 +1231,7 @@ m_elemContext.m_startTagOpen = false; } - if (shouldIndent()) + if (!m_cdataTagOpen && shouldIndent()) indent(); boolean writeCDataBrackets = @@ -1644,7 +1629,7 @@ int i, char ch, int lastDirty, - boolean fromTextNode) throws IOException + boolean fromTextNode) throws IOException, SAXException { int startClean = lastDirty + 1; // if we have some clean characters accumulated @@ -1723,54 +1708,40 @@ int len, boolean fromTextNode, boolean escLF) - throws IOException + throws IOException, SAXException { int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF); if (i == pos) { + if (m_highSurrogate != 0) { + if (!(Encodings.isLowUTF16Surrogate(ch))) { + throwIOE(m_highSurrogate, ch); + } + writeCharRef(writer, m_highSurrogate, ch); + m_highSurrogate = 0; + return ++pos; + } + if (Encodings.isHighUTF16Surrogate(ch)) { - - // Should be the UTF-16 low surrogate of the hig/low pair. - char next; - // Unicode code point formed from the high/low pair. - int codePoint = 0; - if (i + 1 >= len) { - throw new IOException( - Utils.messages.createMessage( - MsgKey.ER_INVALID_UTF16_SURROGATE, - new Object[] { Integer.toHexString(ch)})); - //"Invalid UTF-16 surrogate detected: " - - //+Integer.toHexString(ch)+ " ?"); + // save for the next read + m_highSurrogate = ch; + pos++; } else { - next = chars[++i]; - + // the next should be the UTF-16 low surrogate of the hig/low pair. + char next = chars[++i]; if (!(Encodings.isLowUTF16Surrogate(next))) - throw new IOException( - Utils.messages.createMessage( - MsgKey - .ER_INVALID_UTF16_SURROGATE, - new Object[] { - Integer.toHexString(ch) - + " " - + Integer.toHexString(next)})); - //"Invalid UTF-16 surrogate detected: " - - //+Integer.toHexString(ch)+" "+Integer.toHexString(next)); - codePoint = Encodings.toCodePoint(ch,next); - } - - writer.write("&#"); - writer.write(Integer.toString(codePoint)); - writer.write(';'); - pos += 2; // count the two characters that went into writing out this entity + throwIOE(ch, next); + + writeCharRef(writer, ch, next); + pos += 2; // count the two characters that went into writing out this entity + } } else { @@ -1782,18 +1753,14 @@ if (isCharacterInC0orC1Range(ch) || (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch))) { - writer.write("&#"); - writer.write(Integer.toString(ch)); - writer.write(';'); + writeCharRef(writer, ch); } else if ((!escapingNotNeeded(ch) || ( (fromTextNode && m_charInfo.isSpecialTextChar(ch)) || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))) - && m_elemContext.m_currentElemDepth > 0) + && m_elemContext.m_currentElemDepth > 0) { - writer.write("&#"); - writer.write(Integer.toString(ch)); - writer.write(';'); + writeCharRef(writer, ch); } else { @@ -1807,6 +1774,45 @@ } /** + * Writes out a character reference. + * @param writer the writer + * @param c the character + * @throws IOException + */ + private void writeCharRef(Writer writer, char c) throws IOException, SAXException { + if (m_cdataTagOpen) + closeCDATA(); + writer.write("&#"); + writer.write(Integer.toString(c)); + writer.write(';'); + } + + /** + * Writes out a pair of surrogates as a character reference + * @param writer the writer + * @param high the high surrogate + * @param low the low surrogate + * @throws IOException + */ + private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException { + if (m_cdataTagOpen) + closeCDATA(); + // Unicode code point formed from the high/low pair. + int codePoint = Encodings.toCodePoint(high, low); + writer.write("&#"); + writer.write(Integer.toString(codePoint)); + writer.write(';'); + return codePoint; + } + + private void throwIOE(char ch, char next) throws IOException { + throw new IOException(Utils.messages.createMessage( + MsgKey.ER_INVALID_UTF16_SURROGATE, + new Object[] {Integer.toHexString(ch) + " " + + Integer.toHexString(next)})); + } + + /** * Receive notification of the beginning of an element, although this is a * SAX method additional namespace or attribute information can occur before * or after this call, that is associated with this element. @@ -2053,7 +2059,7 @@ Writer writer, String string, String encoding) - throws IOException + throws IOException, SAXException { final int len = string.length(); if (len > m_attrBuff.length) --- old/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToTextStream.java 2018-09-17 11:38:51.778796866 -0700 +++ new/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToTextStream.java 2018-09-17 11:38:51.384759958 -0700 @@ -1,6 +1,5 @@ /* - * reserved comment block - * DO NOT REMOVE OR ALTER! + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. */ /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -34,6 +33,7 @@ * This class converts SAX or SAX-like calls to a * serialized document for xsl:output method of "text". * @xsl.usage internal + * @LastModified: Sept 2018 */ public final class ToTextStream extends ToStream { @@ -295,23 +295,32 @@ } else if (m_encodingInfo.isInEncoding(c)) { writer.write(c); // one input char processed - } else if (Encodings.isHighUTF16Surrogate(c)) { + } else if (Encodings.isHighUTF16Surrogate(c) || + Encodings.isLowUTF16Surrogate(c)) { final int codePoint = writeUTF16Surrogate(c, ch, i, end); - if (codePoint != 0) { - // I think we can just emit the message, - // not crash and burn. - final String integralValue = Integer.toString(codePoint); - final String msg = Utils.messages.createMessage( - MsgKey.ER_ILLEGAL_CHARACTER, - new Object[] { integralValue, encoding }); - - //Older behavior was to throw the message, - //but newer gentler behavior is to write a message to System.err - //throw new SAXException(msg); - System.err.println(msg); - + if (codePoint >= 0) { + // move the index if the low surrogate is consumed + // as writeUTF16Surrogate has written the pair + if (Encodings.isHighUTF16Surrogate(c)) { + i++; + } + + // printing to the console is not appropriate, but will leave + // it as is for compatibility. + if (codePoint >0) { + // I think we can just emit the message, + // not crash and burn. + final String integralValue = Integer.toString(codePoint); + final String msg = Utils.messages.createMessage( + MsgKey.ER_ILLEGAL_CHARACTER, + new Object[] { integralValue, encoding }); + + //Older behavior was to throw the message, + //but newer gentler behavior is to write a message to System.err + //throw new SAXException(msg); + System.err.println(msg); + } } - i++; // two input chars processed } else { // Don't know what to do with this char, it is // not in the encoding and not a high char in --- /dev/null 2018-04-28 07:26:06.641956972 -0700 +++ new/test/jaxp/javax/xml/jaxp/unittest/transform/JDK8207760.java 2018-09-17 11:38:52.288844640 -0700 @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package transform; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.StringReader; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; + +import org.testng.Assert; +import org.testng.annotations.Listeners; +import org.testng.annotations.Test; +import java.util.Random; +import javax.xml.transform.OutputKeys; +import org.testng.annotations.DataProvider; + +/* + * @test + * @library /javax/xml/jaxp/libs /javax/xml/jaxp/unittest + * @run testng/othervm transform.JDK8207760 + * @summary Verifies that a surrogate pair at the edge of a buffer is properly handled + * @bug 8207760 + */ +@Listeners({jaxp.library.FilePolicy.class}) +public class JDK8207760 { + final String xsl8207760 = + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n"; + + final String xsl8207760_2 = "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + ""; + + final String xsl8207760_3 = "\n" + + "\n" + + " \n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + ""; + + @DataProvider(name = "xsls") + public Object[][] getDataBug8207760_cdata() { + return new Object[][]{ + {xsl8207760_2}, + {xsl8207760_3}, + }; + } + + /* + * @bug 8207760 + * Verifies that a surrogate pair at the edge of a buffer is properly handled + * when serializing into a Character section. + */ + @Test + public final void testBug8207760() throws Exception { + String[] xmls = prepareXML(false); + Transformer t = createTransformerFromInputstream( + new ByteArrayInputStream(xsl8207760.getBytes(StandardCharsets.UTF_8))); + t.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.name()); + StringWriter sw = new StringWriter(); + t.transform(new StreamSource(new StringReader(xmls[0])), new StreamResult(sw)); + Assert.assertEquals(sw.toString().replaceAll(System.lineSeparator(), "\n"), xmls[1]); + } + + /* + * @bug 8207760 + * Verifies that a surrogate pair at the edge of a buffer is properly handled + * when serializing into a CDATA section. + */ + @Test(dataProvider = "xsls") + public final void testBug8207760_cdata(String xsl) throws Exception { + String[] xmls = prepareXML(true); + Transformer t = createTransformerFromInputstream( + new ByteArrayInputStream(xsl.getBytes(StandardCharsets.UTF_8))); + t.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.name()); + StringWriter sw = new StringWriter(); + t.transform(new StreamSource(new StringReader(xmls[0])), new StreamResult(sw)); + Assert.assertEquals(sw.toString().replaceAll(System.lineSeparator(), "\n"), xmls[1]); + } + + private String[] prepareXML(boolean cdata) { + String xml = ""; + if (cdata) { + xml += ""; + if (cdata) { + tail = "abc 123 ]]>"; + } + String temp = generateString(1023); + xml = xml + temp + '\uD83C' + '\uDF42' + tail; + //xml = xml + temp + tail; + String expected = (!cdata) ? "" + temp + "🍂" + tail + : xml; + + return new String[]{xml, expected}; + } + + static final char[] CHARS = "abcdefghijklmnopqrstuvwxyz \n".toCharArray(); + StringBuilder sb = new StringBuilder(1024 << 4); + Random random = new Random(); + + private String generateString(int size) { + sb.setLength(0); + for (int i = 0; i < size; i++) { + char c = CHARS[random.nextInt(CHARS.length)]; + sb.append(c); + } + + return sb.toString(); + } + + private Transformer createTransformerFromInputstream(InputStream xslStream) + throws TransformerException { + return TransformerFactory.newInstance().newTransformer(new StreamSource(xslStream)); + } +}