# HG changeset patch # User igerasim # Date 1567649348 25200 # Wed Sep 04 19:09:08 2019 -0700 # Node ID 021d2408e69f06c0588903b8011eadfb7ed36000 # Parent f016cc0874f03dbeec5e275f45247ee5724acad3 [mq]: 8230365-Pattern-for-a-control-char-matches-non-control-characters diff --git a/src/java.base/share/classes/java/util/regex/ASCII.java b/src/java.base/share/classes/java/util/regex/ASCII.java --- a/src/java.base/share/classes/java/util/regex/ASCII.java +++ b/src/java.base/share/classes/java/util/regex/ASCII.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -259,6 +259,10 @@ return isType(ch, WORD); } + static boolean isCntrlId(int ch) { + return ((ch-0x3f)|(0x5f-ch)) >= 0; + } + static int toDigit(int ch) { return (ctype[ch & 0x7F] & 0x3F); } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -45,6 +45,8 @@ import jdk.internal.util.ArraysSupport; +import sun.security.action.GetPropertyAction; + /** * A compiled representation of a regular expression. 
* @@ -130,7 +132,9 @@ * {@code \e} * The escape character ('\u001B') * {@code \c}x - * The control character corresponding to x + * The control character corresponding to x + * (x is either {@code A} through {@code Z} or one of + * {@code ?}, {@code @}, {@code [}, {@code \\}, {@code ]}, {@code ^}, {@code _}) * * Character classes * @@ -151,7 +155,7 @@ * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction) * {@code [a-z&&[^m-p]]} * {@code a} through {@code z}, - * and not {@code m} through {@code p}: {@code [a-lq-z]}(subtraction) + * and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction) * * Predefined character classes * @@ -1058,6 +1062,25 @@ private transient boolean hasSupplementary; /** + * If {@code true} then only a limited list of chars is accepted as + * control-character IDs in regular expressions of the form "\\cX": + * 'A' through 'Z', '?', '@', '[', '\\', ']', '^', '_'. + * Otherwise, no restrictions on the IDs are imposed. + */ + private static final boolean RESTRICTED_CONTROL_CHAR_IDS = Boolean.valueOf( + GetPropertyAction.privilegedGetProperty( + "jdk.util.regex.restrictedControlCharIds", "true")); + + /** + * If {@code true} then lower-case control-character IDs are mapped to + * their upper-case counterparts. + * For example, "\\ca" will be the same as "\\cA". + */ + private static final boolean ALLOW_LOWERCASE_CONTROL_CHAR_IDS = Boolean.valueOf( + GetPropertyAction.privilegedGetProperty( + "jdk.util.regex.allowLowerCaseControlCharIds", "false")); + + /** * Compiles the given regular expression into a pattern. 
* * @param regex @@ -3325,7 +3348,11 @@ */ private int c() { if (cursor < patternLength) { - return read() ^ 64; + int ch = read(); + if (ALLOW_LOWERCASE_CONTROL_CHAR_IDS && ASCII.isLower(ch)) + return ch ^ 0x60; + if (!RESTRICTED_CONTROL_CHAR_IDS || ASCII.isCntrlId(ch)) + return ch ^ 0x40; } throw error("Illegal control escape sequence"); } diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java --- a/test/jdk/java/util/regex/RegExTest.java +++ b/test/jdk/java/util/regex/RegExTest.java @@ -35,7 +35,7 @@ * 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819 * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 - * 8194667 8197462 8184692 8221431 8224789 8228352 + * 8194667 8197462 8184692 8221431 8224789 8228352 8230365 * * @library /test/lib * @library /lib/testlibrary/java/lang @@ -57,6 +57,7 @@ import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Random; import java.util.Scanner; @@ -186,6 +187,7 @@ invalidGroupName(); illegalRepetitionRange(); surrogatePairWithCanonEq(); + controlCharacters(); if (failure) { throw new @@ -4984,4 +4986,75 @@ } report("surrogatePairWithCanonEq"); } + + private static void controlCharacters() { + char[] contolCharsPairs = { '@', 0x00, + 'A', 0x01, 'B', 0x02, 'C', 0x03, 'D', 0x04, 'E', 0x05, 'F', 0x06, + 'G', 0x07, 'H', 0x08, 'I', 0x09, 'J', 0x0a, 'K', 0x0b, 'L', 0x0c, + 'M', 0x0d, 'N', 0x0e, 'O', 0x0f, 'P', 0x10, 'Q', 0x11, 'R', 0x12, + 'S', 0x13, 'T', 0x14, 'U', 0x15, 'V', 0x16, 'W', 0x17, 'X', 0x18, + 'Y', 0x19, 'Z', 0x1a, + '[', 0x1b, '\\', 0x1c, ']', 0x1d, '^', 0x1e, '_', 0x1f, '?', 0x7f }; + var contolChars = new HashMap(); + for (int i = 0; i < contolCharsPairs.length; i += 2) + contolChars.put(Character.valueOf(contolCharsPairs[i]), + Integer.valueOf(contolCharsPairs[i + 1])); + + 
for (char chP = 0; chP <= 0xff + 16; ++chP) { + String pat = "\\c"; + if (chP < 0xff) { + // \cx with x in the range 0x00..0xfe (ASCII and Latin-1) + pat = "\\c" + Character.toString(chP); + } else if (chP == 0xff) { + // incomplete \c at the end of pattern + pat = "\\c"; + } else if (chP <= 0xff + 8) { + // \cx with a random non-ASCII char x + int x = 0xff + generator.nextInt(0xff00 + 1); + pat = "\\c" + Character.toString(x); + } else { + // \cx with a random non-ASCII codepoint x + int x = 0xff + generator.nextInt(Character.MAX_CODE_POINT + 1 - 0xff); + pat = "\\c" + Character.toString(x); + } + if (contolChars.containsKey(chP)) { + try { + Pattern p = Pattern.compile(pat); + for (int chS = 0; chS < 0xff; ++chS) { + Matcher m = p.matcher(Character.toString(chS)); + if (m.matches() && contolChars.get(chP) != chS) { + failCount++; + System.out.println("Control character 0x" + Integer.toHexString(chS) + + " unexpectedly matched pattern " + pat); + } else if (!m.matches() && contolChars.get(chP) == chS) { + failCount++; + System.out.println("Control character 0x" + Integer.toHexString(chS) + + " failed to match pattern " + pat); + } + if (m.matches() && Character.getType(chS) != Character.CONTROL) { + failCount++; + System.out.println("Non-control character 0x" + Integer.toHexString(chS) + + " unexpectedly matched pattern " + pat); + } + } + } catch (Throwable t) { + failCount++; + System.out.println("Failed to compile pattern " + pat + + " due to exception: " + t); + } + } else { + try { + Pattern p = Pattern.compile(pat); + failCount++; + System.out.println("Expected to throw an exception when compiling " + pat); + } catch (PatternSyntaxException expected) { + } catch (Throwable t) { + failCount++; + System.out.println("Unexpected exception when compiling " + pat + + " : " + t); + } + } + } + report("controlCharacters"); + } }