# HG changeset patch # User jlaskey # Date 1536950077 10800 # Fri Sep 14 15:34:37 2018 -0300 # Node ID 15d46db30d2ab5b17089e791de2d0aa956feea6b # Parent ccea318862aef53cb145933cd11a78845b2da353 8202442: String::unescape Reviewed-by: smarks, rriggs, sherman diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -2972,6 +2972,88 @@ } /** + * Translates all Unicode escapes and escape sequences in this string into + * characters represented by those escapes specified in sections 3.3 and + * 3.10.6 of <cite>The Java&trade; Language Specification</cite>. + * <p>

+ * Each Unicode escape in the form <code>&#92;unnnn</code> is translated to the + * Unicode character whose code point is {@code 0xnnnn}. Care should be + * taken when using UTF-16 surrogate pairs to ensure that the high + * surrogate (U+D800..U+DBFF) is immediately followed by a low surrogate + * (U+DC00..U+DFFF); otherwise a + * {@link java.nio.charset.CharacterCodingException} may occur during UTF-8 + * decoding. + * <p>

+     * Backslash escape sequences are translated as follows:
+     * <table>
+     *   <caption>Escape sequences</caption>
+     *   <thead>
+     *   <tr>
+     *     <th>Escape</th>
+     *     <th>Name</th>
+     *     <th>Unicode</th>
+     *   </tr>
+     *   </thead>
+     *   <tr>
+     *     <td>{@code \b}</td>
+     *     <td>backspace</td>
+     *     <td>U+0008</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \t}</td>
+     *     <td>horizontal tab</td>
+     *     <td>U+0009</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \n}</td>
+     *     <td>line feed</td>
+     *     <td>U+000A</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \f}</td>
+     *     <td>form feed</td>
+     *     <td>U+000C</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \r}</td>
+     *     <td>carriage return</td>
+     *     <td>U+000D</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \"}</td>
+     *     <td>double quote</td>
+     *     <td>U+0022</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \'}</td>
+     *     <td>single quote</td>
+     *     <td>U+0027</td>
+     *   </tr>
+     *   <tr>
+     *     <td>{@code \\}</td>
+     *     <td>backslash</td>
+     *     <td>U+005C</td>
+     *   </tr>
+     * </table>
+     * <p>

+     * Octal escapes {@code \0 - \377} are translated to their code
+     * point equivalents.
+     *
+     * @return String with all escapes translated.
+     *
+     * @throws IllegalArgumentException when escape sequence is malformed.
+     *
+     * @since 12
+     *
+     * @deprecated Preview feature associated with Raw String Literals.
+     *             Use at your own risk.
+     */
+    @Deprecated(forRemoval=true, since="12")
+    public String unescape() throws IllegalArgumentException {
+        return isLatin1() ? StringLatin1.unescape(value) : StringUTF16.unescape(value);
+    }
+
+    /**
      * This object (which is already a string!) is itself returned.
      *
      * @return the string itself.
diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java
--- a/src/java.base/share/classes/java/lang/StringLatin1.java
+++ b/src/java.base/share/classes/java/lang/StringLatin1.java
@@ -857,4 +857,95 @@
             return cs;
         }
     }
+
+    /*
+     * Translates the Unicode escapes and backslash escape sequences found in
+     * the Latin-1 bytes of value. When a Unicode escape denotes a character
+     * that canEncode rejects, the input is inflated and the whole translation
+     * is redone by StringUTF16.unescape.
+     * Throws IllegalArgumentException when an escape sequence is malformed.
+     */
+    static String unescape(byte[] value) throws IllegalArgumentException {
+        int length = value.length;
+        // The result is never longer than the input, so this is large enough.
+        byte[] chars = new byte[length];
+        int from = 0;
+        int to = 0;
+        while (from < length) {
+            char ch = getChar(value, from++);
+            // A backslash that is the last character is copied through as is.
+            if (ch == '\\' && from < length) {
+                ch = getChar(value, from++);
+                if (ch == 'u') {
+                    // Skip any additional 'u' characters after the first.
+                    while (from < length && getChar(value, from) == 'u') {
+                        from++;
+                    }
+                    if (length <= from + 3) {
+                        throw new IllegalArgumentException("unicode escape sequence truncated at end of string, pos = " + from);
+                    }
+                    int code = (Character.digit(getChar(value, from + 0), 16) << 12) |
+                               (Character.digit(getChar(value, from + 1), 16) <<  8) |
+                               (Character.digit(getChar(value, from + 2), 16) <<  4) |
+                                Character.digit(getChar(value, from + 3), 16);
+                    // A non-hexadecimal digit yields -1, so the combined value is negative.
+                    if (code < 0) {
+                        throw new IllegalArgumentException("unicode escape sequence contains non hexadecimal digits, pos = " + from);
+                    }
+                    if (canEncode(code)) {
+                        ch = (char)code;
+                        from += 4;
+                    } else {
+                        // Does not fit in Latin-1; start over on an inflated copy of the input.
+                        return StringUTF16.unescape(inflate(value, 0, length));
+                    }
+                } else {
+                    switch (ch) {
+                        case 'b':
+                            ch = '\b';
+                            break;
+                        case 'f':
+                            ch = '\f';
+                            break;
+                        case 'n':
+                            ch = '\n';
+                            break;
+                        case 'r':
+                            ch = '\r';
+                            break;
+                        case 't':
+                            ch = '\t';
+                            break;
+                        case '"':
+                        case '\'':
+                        case '\\':
+                            // These escapes denote the escaped character itself.
+                            break;
+                        case '0': case '1': case '2': case '3':
+                        case '4': case '5': case '6': case '7':
+                            int code = ch - '0';
+                            // Consume at most two further octal digits.
+                            for (int i = 0; i < 2 && from < length; i++) {
+                                int digit = Character.digit(getChar(value, from), 8);
+                                if (digit < 0) {
+                                    break;
+                                }
+                                from++;
+                                code = code << 3 | digit;
+                            }
+                            if (0377 < code) {
+                                throw new IllegalArgumentException("octal escape sequence value is too large, pos = " + from);
+                            }
+                            ch = (char)code;
+                            break;
+                        default:
+                            throw new IllegalArgumentException("unrecognized escape sequence, pos = " + from);
+                    }
+                }
+            }
+            chars[to++] = (byte)ch;
+        }
+        return new String(Arrays.copyOfRange(chars, 0, to), LATIN1);
+    }
+
 }
diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java
--- a/src/java.base/share/classes/java/lang/StringUTF16.java
+++ b/src/java.base/share/classes/java/lang/StringUTF16.java
@@ -1548,4 +1548,86 @@
         String.checkBoundsOffCount(offset, count, length(val));
     }
 
+    /*
+     * Translates the Unicode escapes and backslash escape sequences found in
+     * the UTF-16 code units of value.
+     * Throws IllegalArgumentException when an escape sequence is malformed.
+     */
+    static String unescape(byte[] value) throws IllegalArgumentException {
+        int length = value.length >>> 1;
+        // The result is never longer than the input, so this is large enough.
+        char[] chars = new char[length];
+        int from = 0;
+        int to = 0;
+        while (from < length) {
+            char ch = getChar(value, from++);
+            // A backslash that is the last character is copied through as is.
+            if (ch == '\\' && from < length) {
+                ch = getChar(value, from++);
+                if (ch == 'u') {
+                    // Skip any additional 'u' characters after the first.
+                    while (from < length && getChar(value, from) == 'u') {
+                        from++;
+                    }
+                    if (length <= from + 3) {
+                        throw new IllegalArgumentException("unicode escape sequence truncated at end of string, pos = " + from);
+                    }
+                    int code = (Character.digit(getChar(value, from + 0), 16) << 12) |
+                               (Character.digit(getChar(value, from + 1), 16) <<  8) |
+                               (Character.digit(getChar(value, from + 2), 16) <<  4) |
+                                Character.digit(getChar(value, from + 3), 16);
+                    // A non-hexadecimal digit yields -1, so the combined value is negative.
+                    if (code < 0) {
+                        throw new IllegalArgumentException("unicode escape sequence contains non hexadecimal digits, pos = " + from);
+                    }
+                    ch = (char)code;
+                    from += 4;
+                } else {
+                    switch (ch) {
+                        case 'b':
+                            ch = '\b';
+                            break;
+                        case 'f':
+                            ch = '\f';
+                            break;
+                        case 'n':
+                            ch = '\n';
+                            break;
+                        case 'r':
+                            ch = '\r';
+                            break;
+                        case 't':
+                            ch = '\t';
+                            break;
+                        case '"':
+                        case '\'':
+                        case '\\':
+                            // These escapes denote the escaped character itself.
+                            break;
+                        case '0': case '1': case '2': case '3':
+                        case '4': case '5': case '6': case '7':
+                            int code = ch - '0';
+                            // Consume at most two further octal digits.
+                            for (int i = 0; i < 2 && from < length; i++) {
+                                int digit = Character.digit(getChar(value, from), 8);
+                                if (digit < 0) {
+                                    break;
+                                }
+                                from++;
+                                code = code << 3 | digit;
+                            }
+                            if (0377 < code) {
+                                throw new IllegalArgumentException("octal escape sequence value is too large, pos = " + from);
+                            }
+                            ch = (char)code;
+                            break;
+                        default:
+                            throw new IllegalArgumentException("unrecognized escape sequence, pos = " + from);
+                    }
+                }
+            }
+            chars[to++] = ch;
+        }
+        return new String(chars, 0, to);
+    }
 }
diff --git a/test/jdk/java/lang/String/Unescape.java b/test/jdk/java/lang/String/Unescape.java
new file mode 100644
--- /dev/null
+++ b/test/jdk/java/lang/String/Unescape.java
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @summary Unit tests for String#unescape
+ * @compile --enable-preview -source 12 -encoding utf8 Unescape.java
+ * @run main/othervm --enable-preview Unescape
+ */
+
+@SuppressWarnings( "deprecation" )
+public class Unescape {
+    public static void main(String[] args) {
+        test1();
+        test2();
+    }
+
+    /*
+     * Test unescaping functionality.
+     */
+    static void test1() {
+        equal(`a\tb\u0063`, "a\\tb\\u0063");
+        equal(`a\tb\u0063`.unescape(), "a\tbc");
+        equal(`a\tb\u2022`, "a\\tb\\u2022");
+        equal(`a\tb\u2022`.unescape(), "a\tb\u2022");
+        equal(`\0\12\012`.unescape(), "\0\12\012");
+        equal(`•\0\12\012`.unescape(), "\u2022\0\12\012");
+
+        equal(`\b`.unescape(), "\b");
+        equal(`\f`.unescape(), "\f");
+        equal(`\n`.unescape(), "\n");
+        equal(`\r`.unescape(), "\r");
+        equal(`\t`.unescape(), "\t");
+        equal(`\"`.unescape(), "\"");
+        equal(`\'`.unescape(), "\'");
+        equal(`\\`.unescape(), "\\");
+        equal(`\0`.unescape(), "\0");
+        equal(`\7`.unescape(), "\7");
+        equal(`\12`.unescape(), "\12");
+        equal(`\012`.unescape(), "\012");
+        equal(`\u0000`.unescape(), "\u0000");
+        equal(`\u2022`.unescape(), "\u2022");
+        equal(`•\b`.unescape(), "•\b");
+        equal(`•\f`.unescape(), "•\f");
+        equal(`•\n`.unescape(), "•\n");
+        equal(`•\r`.unescape(), "•\r");
+        equal(`•\t`.unescape(), "•\t");
+        equal(`•\"`.unescape(), "•\"");
+        equal(`•\'`.unescape(), "•\'");
+        equal(`•\\`.unescape(), "•\\");
+        equal(`•\0`.unescape(), "•\0");
+        equal(`•\7`.unescape(), "•\7");
+        equal(`•\12`.unescape(), "•\12");
+        equal(`•\177`.unescape(), "•\177");
+        equal(`•\u0000`.unescape(), "•\u0000");
+        equal(`•\u2022`.unescape(), "•\u2022");
+    }
+
+    /*
+     * Test for IllegalArgumentException.
+     */
+    static void test2() {
+        wellFormed(`\b`);
+        wellFormed(`\f`);
+        wellFormed(`\n`);
+        wellFormed(`\r`);
+        wellFormed(`\t`);
+        wellFormed(`\"`);
+        wellFormed(`\'`);
+        wellFormed(`\\`);
+        wellFormed(`\0`);
+        wellFormed(`\7`);
+        wellFormed(`\12`);
+        wellFormed(`\012`);
+        wellFormed(`\u0000`);
+        wellFormed(`\u2022`);
+        wellFormed(`•\b`);
+        wellFormed(`•\f`);
+        wellFormed(`•\n`);
+        wellFormed(`•\r`);
+        wellFormed(`•\t`);
+        wellFormed(`•\"`);
+        wellFormed(`•\'`);
+        wellFormed(`•\\`);
+        wellFormed(`•\0`);
+        wellFormed(`•\7`);
+        wellFormed(`•\12`);
+        wellFormed(`•\012`);
+        wellFormed(`•\u0000`);
+        wellFormed(`•\u2022`);
+
+        malformed(`\x`);
+        malformed(`\+`);
+        malformed(`\u`);
+        malformed(`\uuuuu`);
+        malformed(`\u2`);
+        malformed(`\u20`);
+        malformed(`\u202`);
+        malformed(`\u2 `);
+        malformed(`\u20 `);
+        malformed(`\u202 `);
+        malformed(`\uuuuu2`);
+        malformed(`\uuuuu20`);
+        malformed(`\uuuuu202`);
+        malformed(`\uuuuu2 `);
+        malformed(`\uuuuu20 `);
+        malformed(`\uuuuu202 `);
+        malformed(`\uG`);
+        malformed(`\u2G`);
+        malformed(`\u20G`);
+        malformed(`\uG `);
+        malformed(`\u2G `);
+        malformed(`\u20G `);
+        malformed(`\uuuuuG`);
+        malformed(`\uuuuu2G`);
+        malformed(`\uuuuu20G`);
+        malformed(`\uuuuuG `);
+        malformed(`\uuuuu2G `);
+        malformed(`\uuuuu20G `);
+
+        malformed(`•\x`);
+        malformed(`•\+`);
+        malformed(`•\u`);
+        malformed(`•\uuuuu`);
+        malformed(`•\u2`);
+        malformed(`•\u20`);
+        malformed(`•\u202`);
+        malformed(`•\u2 `);
+        malformed(`•\u20 `);
+        malformed(`•\u202 `);
+        malformed(`•\uuuuu2`);
+        malformed(`•\uuuuu20`);
+        malformed(`•\uuuuu202`);
+        malformed(`•\uuuuu2 `);
+        malformed(`•\uuuuu20 `);
+        malformed(`•\uuuuu202 `);
+        malformed(`•\uG`);
+        malformed(`•\u2G`);
+        malformed(`•\u20G`);
+        malformed(`•\uG `);
+        malformed(`•\u2G `);
+        malformed(`•\u20G `);
+        malformed(`•\uuuuuG`);
+        malformed(`•\uuuuu2G`);
+        malformed(`•\uuuuu20G`);
+        malformed(`•\uuuuuG `);
+        malformed(`•\uuuuu2G `);
+        malformed(`•\uuuuu20G `);
+    }
+
+    /*
+     * Report difference in result.
+     */
+    static void report(String message, String inputTag, String input,
+                       String outputTag, String output) {
+        System.err.println(message);
+        System.err.println();
+        System.err.println(inputTag);
+        System.err.println(String.valueOf(input).replace(' ', '.'));
+        System.err.println();
+        System.err.println(outputTag);
+        System.err.println(String.valueOf(output).replace(' ', '.'));
+        throw new RuntimeException();
+    }
+
+    /*
+     * Raise an exception if the two inputs are not equivalent.
+     */
+    static void equal(String input, String expected) {
+        if (input == null || expected == null || !expected.equals(input)) {
+            report("Failed equal", "Input:", input, "Expected:", expected);
+        }
+    }
+
+    /*
+     * Raise an exception if the string contains a malformed escape.
+     */
+    static void wellFormed(String rawString) {
+        try {
+            rawString.unescape();
+        } catch (IllegalArgumentException ex) {
+            System.err.println("Failed wellFormed");
+            System.err.println(rawString);
+            throw new RuntimeException();
+        }
+    }
+
+    /*
+     * Raise an exception if the string does not contain a malformed escape.
+     */
+    static void malformed(String rawString) {
+        try {
+            rawString.unescape();
+            System.err.println("Failed malformed");
+            System.err.println(rawString);
+            throw new RuntimeException();
+        } catch (IllegalArgumentException ex) {
+            // incorrectly formed escapes
+        }
+    }
+}