--- old/src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/runtime/linker/NameCodec.java 2015-11-03 19:54:42.095873500 +0530 +++ new/src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/runtime/linker/NameCodec.java 2015-11-03 19:54:41.410858300 +0530 @@ -26,44 +26,256 @@ package jdk.nashorn.internal.runtime.linker; /** + *

* Implements the name mangling and demangling as specified by John Rose's * "Symbolic Freedom in the VM" article. Normally, you would * mangle the names in the call sites as you're generating bytecode, and then * demangle them when you receive them in bootstrap methods. + *

+ *

+ * This code is derived from sun.invoke.util.BytecodeName. We only need a subset of that + * class, and we do not want the nashorn module to depend on a non-exported package + * of java.base. + *

+ * + *

Comment from BytecodeName class reproduced here:

+ * + * Includes universal mangling rules for the JVM. + * + *

Avoiding Dangerous Characters

+ * + *

+ * The JVM defines a very small set of characters which are illegal + * in name spellings. We will slightly extend and regularize this set + * into a group of dangerous characters. + * These characters will then be replaced, in mangled names, by escape sequences. + * In addition, accidental escape sequences must be further escaped. + * Finally, a special prefix will be applied if and only if + * the mangling would otherwise fail to begin with the escape character. + * This happens to cover the corner case of the null string, + * and also clearly marks symbols which need demangling. + *

+ *

+ * Dangerous characters are the union of all characters forbidden + * or otherwise restricted by the JVM specification, + * plus their mates, if they are brackets + * ([ and ], + * < and >), + * plus, arbitrarily, the colon character :. + * There is no distinction between type, method, and field names. + * This makes it easier to convert between mangled names of different + * types, since they do not need to be decoded (demangled). + *

+ *

+ * The escape character is backslash \ + * (also known as reverse solidus). + * This character is, until now, unheard of in bytecode names, + * but traditional in the proposed role. + * + *

+ *

Replacement Characters

+ * + * + *

+ * Every escape sequence is two characters + * (in fact, two UTF8 bytes) beginning with + * the escape character and followed by a + * replacement character. + * (Since the replacement character is never a backslash, + * iterated manglings do not double in size.) + *

+ *

+ * Each dangerous character has some rough visual similarity + * to its corresponding replacement character. + * This makes mangled symbols easier to recognize by sight. + *

+ *

+ * The dangerous characters are + * / (forward slash, used to delimit package components), + * . (dot, also a package delimiter), + * ; (semicolon, used in signatures), + * $ (dollar, used in inner classes and synthetic members), + * < (left angle), + * > (right angle), + * [ (left square bracket, used in array types), + * ] (right square bracket, reserved in this scheme for language use), + * and : (colon, reserved in this scheme for language use). + * Their replacements are, respectively, + * | (vertical bar), + * , (comma), + * ? (question mark), + * % (percent), + * ^ (caret), + * _ (underscore), + * { (left curly bracket), + * } (right curly bracket), and + * ! (exclamation mark). + * In addition, the replacement character for the escape character itself is + * - (hyphen), + * and the replacement character for the null prefix is + * = (equal sign). + *

+ *

+ * An escape character \ + * followed by any of these replacement characters + * is an escape sequence, and there are no other escape sequences. + * An equal sign is only part of an escape sequence + * if it is the second character in the whole string, following a backslash. + * Two consecutive backslashes do not form an escape sequence. + *

+ *

+ * Each escape sequence replaces a so-called original character + * which is either one of the dangerous characters or the escape character. + * A null prefix replaces an initial null string, not a character. + *

+ *

+ * All this implies that escape sequences cannot overlap and may be + * determined all at once for a whole string. Note that a spelling + * string can contain accidental escapes, apparent escape + * sequences which must not be interpreted as manglings. + * These are disabled by replacing their leading backslash with an + * escape sequence (\-). To mangle a string, three logical steps + * are required, though they may be carried out in one pass: + *

+ *
    + *
  1. In each accidental escape, replace the backslash with an escape sequence + * (\-).
  2. + * Replace each dangerous character with an escape sequence + * (\| for /, etc.).
  3. + * If the first two steps introduced any change, and + * if the string does not already begin with a backslash, prepend a null prefix (\=).
    + *
+ * + * To demangle a mangled string that begins with an escape, + * remove any null prefix, and then replace (in parallel) + * each escape sequence by its original character. + *

Spelling strings which contain accidental + * escapes must have them replaced, even if those + * strings do not contain dangerous characters. + * This restriction means that mangling a string always + * requires a scan of the string for escapes. + * But then, a scan would be required anyway, + * to check for dangerous characters. + * + *

+ *

Nice Properties

+ * + *

+ * If a bytecode name does not contain any escape sequence, + * demangling is a no-op: The string demangles to itself. + * Such a string is called self-mangling. + * Almost all strings are self-mangling. + * In practice, to demangle almost any name “found in nature”, + * simply verify that it does not begin with a backslash. + *

+ *

+ * Mangling is a one-to-one function, while demangling + * is a many-to-one function. + * A mangled string is defined as validly mangled if + * it is in fact the unique mangling of its spelling string. + * Three examples of invalidly mangled strings are \=foo, + * \-bar, and baz\!, which demangle to foo, \bar, and + * baz\!, but then remangle to foo, \bar, and \=baz\-!. + * If a language back-end or runtime is using mangled names, + * it should never present an invalidly mangled bytecode + * name to the JVM. If the runtime encounters one, + * it should also report an error, since such an occurrence + * probably indicates a bug in name encoding which + * will lead to errors in linkage. + * However, this note does not propose that the JVM verifier + * detect invalidly mangled names. + *

+ *

+ * As a result of these rules, it is a simple matter to + * compute validly mangled substrings and concatenations + * of validly mangled strings, and (with a little care) + * these correspond to corresponding operations on their + * spelling strings. + *

+ * + *

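If languages that include non-Java symbol spellings use this + * mangling convention, they will enjoy the following advantages: + *
  - They can interoperate via symbols they share in common.
  - Low-level tools, such as backtrace printers, will have readable displays.
  - Future JVM and language extensions can safely use the dangerous characters for structuring symbols, but will never interfere with valid spellings.
  - Runtimes and compilers can use standard libraries for mangling and demangling.
  - Occasional transliterations and name composition will be simple and regular, for classes, methods, and fields.
  - Bytecode names will continue to be compact. When mangled, spellings will at most double in length, either in UTF8 or UTF16 format, and most will not change at all.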

+ * + * + * + *

Suggestions for Human Readable Presentations

+ * + * + *

+ * For human readable displays of symbols, + * it will be better to present a string-like quoted + * representation of the spelling, because JVM users + * are generally familiar with such tokens. + * We suggest using single or double quotes before and after + * mangled symbols which are not valid Java identifiers, + * with quotes, backslashes, and non-printing characters + * escaped as if for literals in the Java language. + *

+ *

+ * For example, an HTML-like spelling + * <pre> mangles to + * \^pre\_ and could + * display more cleanly as + * '<pre>', + * with the quotes included. + * Such string-like conventions are not suitable + * for mangled bytecode names, in part because + * dangerous characters must be eliminated, rather + * than just quoted. Otherwise internally structured + * strings like package prefixes and method signatures + * could not be reliably parsed. + *

+ *

+ * In such human-readable displays, invalidly mangled + * names should not be demangled and quoted, + * for this would be misleading. Likewise, JVM symbols + * which contain dangerous characters (like dots in field + * names or brackets in method names) should not be + * simply quoted. The bytecode names + * \=phase\,1 and + * phase.1 are distinct, + * and in demangled displays they should be presented as + * 'phase.1' and something like + * 'phase'.1, respectively. + *

*/ public final class NameCodec { - private static final char ESCAPE_CHAR = '\\'; - private static final char EMPTY_ESCAPE = '='; + private NameCodec() { /* private constructor: the class exposes only static methods */ + } + + private static final char ESCAPE_C = '\\'; + // empty escape sequence to avoid a null name or illegal prefix + private static final char NULL_ESCAPE_C = '='; + private static final String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C; + /** * Canonical encoding for the empty name. */ - public static final String EMPTY_NAME = new String(new char[] { ESCAPE_CHAR, EMPTY_ESCAPE }); - private static final char EMPTY_CHAR = 0xFEFF; - - private static final int MIN_ENCODING = '$'; - private static final int MAX_ENCODING = ']'; - private static final char[] ENCODING = new char[MAX_ENCODING - MIN_ENCODING + 1]; - private static final int MIN_DECODING = '!'; - private static final int MAX_DECODING = '}'; - private static final char[] DECODING = new char[MAX_DECODING - MIN_DECODING + 1]; - - static { - addEncoding('/', '|'); - addEncoding('.', ','); - addEncoding(';', '?'); - addEncoding('$', '%'); - addEncoding('<', '^'); - addEncoding('>', '_'); - addEncoding('[', '{'); - addEncoding(']', '}'); - addEncoding(':', '!'); - addEncoding('\\', '-'); - DECODING[EMPTY_ESCAPE - MIN_DECODING] = EMPTY_CHAR; - } - - private NameCodec() { - } + public static final String EMPTY_NAME = new String(new char[] { ESCAPE_C, NULL_ESCAPE_C }); /** * Encodes ("mangles") an unencoded symbolic name. @@ -71,37 +283,10 @@ * @return the mangled form of the symbolic name. 
*/ public static String encode(final String name) { - final int l = name.length(); - if(l == 0) { - return EMPTY_NAME; - } - StringBuilder b = null; - int lastEscape = -1; - for(int i = 0; i < l; ++i) { - final int encodeIndex = name.charAt(i) - MIN_ENCODING; - if(encodeIndex >= 0 && encodeIndex < ENCODING.length) { - final char e = ENCODING[encodeIndex]; - if(e != 0) { - if(b == null) { - b = new StringBuilder(name.length() + 3); - if(name.charAt(0) != ESCAPE_CHAR && i > 0) { - b.append(EMPTY_NAME); - } - b.append(name, 0, i); - } else { - b.append(name, lastEscape + 1, i); - } - b.append(ESCAPE_CHAR).append(e); - lastEscape = i; - } - } - } - if(b == null) { - return name; - } - assert lastEscape != -1; - b.append(name, lastEscape + 1, l); - return b.toString(); + String bn = mangle(name); + assert((Object)bn == name || looksMangled(bn)) : bn; + assert(name.equals(decode(bn))) : name; + return bn; } /** @@ -110,42 +295,138 @@ * @return the demangled form of the symbolic name. */ public static String decode(final String name) { - if(name.isEmpty() || name.charAt(0) != ESCAPE_CHAR) { - return name; - } - final int l = name.length(); - if(l == 2 && name.charAt(1) == EMPTY_CHAR) { - return ""; + String sn = name; + if (!sn.isEmpty() && looksMangled(name)) { + sn = demangle(name); + assert(name.equals(mangle(sn))) : name+" => "+sn+" => "+mangle(sn); } - final StringBuilder b = new StringBuilder(name.length()); - int lastEscape = -2; - int lastBackslash = -1; - for(;;) { - final int nextBackslash = name.indexOf(ESCAPE_CHAR, lastBackslash + 1); - if(nextBackslash == -1 || nextBackslash == l - 1) { - break; + return sn; + } + + private static boolean looksMangled(String s) { /* precondition: s is non-empty, since charAt(0) throws on an empty string */ + return s.charAt(0) == ESCAPE_C; + } + + private static String mangle(String s) { /* returns s itself (same reference) when no character needs escaping */ + if (s.length() == 0) + return NULL_ESCAPE; + + // build this lazily, when we first need an escape: + StringBuilder sb = null; + + for (int i = 0, slen = s.length(); i < slen; i++) { + char c = s.charAt(i); + + 
boolean needEscape = false; + if (c == ESCAPE_C) { + if (i+1 < slen) { + char c1 = s.charAt(i+1); + if ((i == 0 && c1 == NULL_ESCAPE_C) + || c1 != originalOfReplacement(c1)) { + // an accidental escape + needEscape = true; + } + } + } else { + needEscape = isDangerous(c); } - final int decodeIndex = name.charAt(nextBackslash + 1) - MIN_DECODING; - if(decodeIndex >= 0 && decodeIndex < DECODING.length) { - final char d = DECODING[decodeIndex]; - if(d == EMPTY_CHAR) { - // "\=" is only valid at the beginning of a mangled string - if(nextBackslash == 0) { - lastEscape = 0; + + if (!needEscape) { + if (sb != null) sb.append(c); + continue; + } + + // build sb if this is the first escape + if (sb == null) { + sb = new StringBuilder(s.length()+10); + // mangled names must begin with a backslash: + if (s.charAt(0) != ESCAPE_C && i > 0) + sb.append(NULL_ESCAPE); + // append the string so far, which is unremarkable: + sb.append(s, 0, i); + } + + // rewrite \ to \-, / to \|, etc. + sb.append(ESCAPE_C); + sb.append(replacementOf(c)); + } + + if (sb != null) return sb.toString(); + + return s; + } + + private static String demangle(String s) { /* drops a leading null prefix, then maps every escape sequence back to its original character */ + // build this lazily, when we first meet an escape: + StringBuilder sb = null; + + int stringStart = 0; + if (s.startsWith(NULL_ESCAPE)) + stringStart = 2; + + for (int i = stringStart, slen = s.length(); i < slen; i++) { + char c = s.charAt(i); + + if (c == ESCAPE_C && i+1 < slen) { + // might be an escape sequence + char rc = s.charAt(i+1); + char oc = originalOfReplacement(rc); + if (oc != rc) { + // build sb if this is the first escape + if (sb == null) { + sb = new StringBuilder(s.length()); + // append the string so far, which is unremarkable: + sb.append(s, stringStart, i); } - } else if(d != 0) { - b.append(name, lastEscape + 2, nextBackslash).append(d); - lastEscape = nextBackslash; + ++i; // skip both characters + c = oc; } } - lastBackslash = nextBackslash; + + if (sb != null) + sb.append(c); } - b.append(name, lastEscape + 
2, l); - return b.toString(); + + if (sb != null) return sb.toString(); + + return s.substring(stringStart); + } + + private static final String DANGEROUS_CHARS = "\\/.;:$[]<>"; // \\ must be first + private static final String REPLACEMENT_CHARS = "-|,?!%{}^_"; + private static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\ + + private static final long[] SPECIAL_BITMAP = new long[2]; // 128 bits: bit c of word (c >>> 6) marks a special char; long shifts use only the low six bits of the count + static { + String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS; + for (char c : SPECIAL.toCharArray()) { + SPECIAL_BITMAP[c >>> 6] |= 1L << c; + } + } + + private static boolean isSpecial(char c) { /* bitmap pre-check used by the lookups below before they scan with indexOf */ + if ((c >>> 6) < SPECIAL_BITMAP.length) + return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0; + else + return false; + } + + private static char replacementOf(char c) { + if (!isSpecial(c)) return c; + int i = DANGEROUS_CHARS.indexOf(c); + if (i < 0) return c; + return REPLACEMENT_CHARS.charAt(i); + } + + private static char originalOfReplacement(char c) { + if (!isSpecial(c)) return c; + int i = REPLACEMENT_CHARS.indexOf(c); + if (i < 0) return c; + return DANGEROUS_CHARS.charAt(i); } - private static void addEncoding(final char from, final char to) { - ENCODING[from - MIN_ENCODING] = to; - DECODING[to - MIN_DECODING] = from; + private static boolean isDangerous(char c) { + if (!isSpecial(c)) return false; + return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX); } } Binary files old/samples/Main.class and /dev/null differ --- /dev/null 2015-11-03 19:54:47.000000000 +0530 +++ new/samples/find_underscores.js 2015-11-03 19:54:46.623904200 +0530 @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of Oracle nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// Usage: jjs find_underscores.js -- <directory> + +if (arguments.length == 0) { + print("Usage: jjs find_underscores.js -- "); + exit(1); +} + +// Java types used +var File = Java.type("java.io.File"); +var Files = Java.type("java.nio.file.Files"); +var StringArray = Java.type("java.lang.String[]"); +var ToolProvider = Java.type("javax.tools.ToolProvider"); +var Tree = Java.type("com.sun.source.tree.Tree"); +var Trees = Java.type("com.sun.source.util.Trees"); +var TreeScanner = Java.type("com.sun.source.util.TreeScanner"); + +function findUnderscores() { + // get the system compiler tool + var compiler = ToolProvider.systemJavaCompiler; + // get standard file manager + var fileMgr = compiler.getStandardFileManager(null, null, null); + // Using Java.to convert script array (arguments) to a Java String[] + var compUnits = fileMgr.getJavaFileObjects(Java.to(arguments, StringArray)); + // create a new compilation task + var task = compiler.getTask(null, fileMgr, null, null, null, compUnits); + var sourcePositions = Trees.instance(task).sourcePositions; + // subclass TreeScanner - to find underscore variable names + var UnderscoreFinder = Java.extend(TreeScanner); + + var visitor = new UnderscoreFinder() { + // override to capture information on current compilation unit + visitCompilationUnit: function(compUnit, p) { + this.compUnit = compUnit; + this.lineMap = compUnit.lineMap; + this.fileName = compUnit.sourceFile.name; + + return Java.super(visitor).visitCompilationUnit(compUnit, p); + }, + + // override to check variable name + visitVariable: function(node, p) { + if (node.name.toString() == "_") { + var pos = sourcePositions.getStartPosition(this.compUnit, node); + var line = this.lineMap.getLineNumber(pos); + var col = this.lineMap.getColumnNumber(pos); + print(node + " @ " + this.fileName + ":" + line + ":" + col); + } + + return Java.super(visitor).visitVariable(node, p); + } + } + + for each (var cu in task.parse()) { + cu.accept(visitor, null); + } +} + +// 
for each ".java" file in directory (recursively). +function main(dir) { + var totalCount = 0; /* NOTE(review): assigned but never read in this function */ + Files.walk(dir.toPath()). + forEach(function(p) { + var name = p.toFile().absolutePath; + if (name.endsWith(".java")) { + findUnderscores(p); /* findUnderscores declares no parameters; p reaches it through its implicit arguments object */ + } + }); +} + +main(new File(arguments[0])); --- /dev/null 2015-11-03 19:54:51.000000000 +0530 +++ new/test/src/jdk/nashorn/internal/runtime/linker/test/NameCodecTest.java 2015-11-03 19:54:49.571950100 +0530 @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package jdk.nashorn.internal.runtime.linker.test; + +import static org.testng.Assert.assertEquals; + +import jdk.nashorn.internal.runtime.linker.NameCodec; +import org.testng.annotations.Test; + +/** + * Test for jdk.nashorn.internal.runtime.linker.NameCodec.java. 
This test is + * derived from BytecodeNameTest.java from (older) mlvm code @ + * http://hg.openjdk.java.net/mlvm/mlvm/file/tip/netbeans/meth/test/sun/invoke/util/BytecodeNameTest.java + * + * @bug 8141285: NameCode should pass tests from BytecodeNameTest.java + */ +public class NameCodecTest { + + static String[][] SAMPLES = { + // {mangled, source}: encode(source) must equal mangled, and decode(mangled) must equal source + {"foo", "foo"}, + {"ba\\r", "ba\\r"}, + {"\\=ba\\-%z", "ba\\%z"}, + {"\\=ba\\--z", "ba\\-z"}, + {"=\\=", "=\\="}, + {"\\==\\|\\=", "=/\\="}, + {"\\|\\=", "/\\="}, + {"\\=ba\\!", "ba:"}, + {"\\|", "/"}, + {"\\", "\\"}, + {"\\\\%", "\\$"}, + {"\\\\", "\\\\"}, + {"\\=", ""} + + }; + + static final String DANGEROUS_CHARS = "\\/.;:$[]<>"; + static final String REPLACEMENT_CHARS = "-|,?!%{}^_"; + + static String[][] canonicalSamples() { + int ndc = DANGEROUS_CHARS.length(); + String[][] res = new String[2 * ndc][]; + for (int i = 0; i < ndc; i++) { + char dc = DANGEROUS_CHARS.charAt(i); + char rc = REPLACEMENT_CHARS.charAt(i); + if (dc == '\\') { + res[2 * i + 0] = new String[]{"\\-%", "\\%"}; + } else { + res[2 * i + 0] = new String[]{"\\" + rc, "" + dc}; + } + res[2 * i + 1] = new String[]{"" + rc, "" + rc}; + } + return res; + } + + @Test + public void testEncode() { + System.out.println("testEncode"); + testEncode(SAMPLES); + testEncode(canonicalSamples()); + } + + private void testEncode(String[][] samples) { + for (String[] sample : samples) { + String s = sample[1]; + String expResult = sample[0]; + String result = NameCodec.encode(s); + if (!result.equals(expResult)) { + System.out.println(s + " => " + result + " != " + expResult); + } + assertEquals(expResult, result); /* NOTE(review): TestNG declares assertEquals(actual, expected), so a failure message labels these two values the wrong way round; same in testDecode - confirm before swapping */ + } + } + + @Test + public void testDecode() { + System.out.println("testDecode"); + testDecode(SAMPLES); + testDecode(canonicalSamples()); + } + + private void testDecode(String[][] samples) { + for (String[] sample : samples) { + String s = sample[0]; + String expResult = sample[1]; + String result = NameCodec.decode(s); + assertEquals(expResult, 
result); + } + } +}