nashorn Cdiff src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/runtime/linker/NameCodec.java

src/jdk.scripting.nashorn/share/classes/jdk/nashorn/internal/runtime/linker/NameCodec.java


*** 24,151 ****
   */
  
  package jdk.nashorn.internal.runtime.linker;
  
  /**
   * Implements the name mangling and demangling as specified by John Rose's
   * <a href="https://blogs.oracle.com/jrose/entry/symbolic_freedom_in_the_vm"
   * target="_blank">"Symbolic Freedom in the VM"</a> article. Normally, you would
   * mangle the names in the call sites as you're generating bytecode, and then
   * demangle them when you receive them in bootstrap methods.
   */
  public final class NameCodec {
!     private static final char ESCAPE_CHAR = '\\';
!     private static final char EMPTY_ESCAPE = '=';
      /**
       * Canonical encoding for the empty name.
       */
!     public static final String EMPTY_NAME = new String(new char[] { ESCAPE_CHAR, EMPTY_ESCAPE });
!     private static final char EMPTY_CHAR = 0xFEFF;
! 
!     private static final int MIN_ENCODING = '$';
!     private static final int MAX_ENCODING = ']';
!     private static final char[] ENCODING = new char[MAX_ENCODING - MIN_ENCODING + 1];
!     private static final int MIN_DECODING = '!';
!     private static final int MAX_DECODING = '}';
!     private static final char[] DECODING = new char[MAX_DECODING - MIN_DECODING + 1];
! 
!     static {
!         addEncoding('/', '|');
!         addEncoding('.', ',');
!         addEncoding(';', '?');
!         addEncoding('$', '%');
!         addEncoding('<', '^');
!         addEncoding('>', '_');
!         addEncoding('[', '{');
!         addEncoding(']', '}');
!         addEncoding(':', '!');
!         addEncoding('\\', '-');
!         DECODING[EMPTY_ESCAPE - MIN_DECODING] = EMPTY_CHAR;
!     }
! 
!     private NameCodec() {
!     }
  
      /**
       * Encodes ("mangles") an unencoded symbolic name.
       * @param name the symbolic name to mangle
       * @return the mangled form of the symbolic name.
       */
      public static String encode(final String name) {
!         final int l = name.length();
!         if(l == 0) {
!             return EMPTY_NAME;
!         }
!         StringBuilder b = null;
!         int lastEscape = -1;
!         for(int i = 0; i < l; ++i) {
!             final int encodeIndex = name.charAt(i) - MIN_ENCODING;
!             if(encodeIndex >= 0 && encodeIndex < ENCODING.length) {
!                 final char e = ENCODING[encodeIndex];
!                 if(e != 0) {
!                     if(b == null) {
!                         b = new StringBuilder(name.length() + 3);
!                         if(name.charAt(0) != ESCAPE_CHAR && i > 0) {
!                             b.append(EMPTY_NAME);
                          }
-                         b.append(name, 0, i);
                      } else {
!                         b.append(name, lastEscape + 1, i);
                      }
!                     b.append(ESCAPE_CHAR).append(e);
!                     lastEscape = i;
                  }
              }
          }
!         if(b == null) {
!             return name;
          }
-         assert lastEscape != -1;
-         b.append(name, lastEscape + 1, l);
-         return b.toString();
      }
  
!     /**
!      * Decodes ("demangles") an encoded symbolic name.
!      * @param name the symbolic name to demangle
!      * @return the demangled form of the symbolic name.
!      */
!     public static String decode(final String name) {
!         if(name.isEmpty() || name.charAt(0) != ESCAPE_CHAR) {
!             return name;
          }
!         final int l = name.length();
!         if(l == 2 && name.charAt(1) == EMPTY_CHAR) {
!             return "";
!         }
!         final StringBuilder b = new StringBuilder(name.length());
!         int lastEscape = -2;
!         int lastBackslash = -1;
!         for(;;) {
!             final int nextBackslash = name.indexOf(ESCAPE_CHAR, lastBackslash + 1);
!             if(nextBackslash == -1 || nextBackslash == l - 1) {
!                 break;
!             }
!             final int decodeIndex = name.charAt(nextBackslash + 1) - MIN_DECODING;
!             if(decodeIndex >= 0 && decodeIndex < DECODING.length) {
!                 final char d = DECODING[decodeIndex];
!                 if(d == EMPTY_CHAR) {
!                     // "\=" is only valid at the beginning of a mangled string
!                     if(nextBackslash == 0) {
!                         lastEscape = 0;
!                     }
!                 } else if(d != 0) {
!                     b.append(name, lastEscape + 2, nextBackslash).append(d);
!                     lastEscape = nextBackslash;
!                 }
!             }
!             lastBackslash = nextBackslash;
!         }
!         b.append(name, lastEscape + 2, l);
!         return b.toString();
!     }
! 
!     private static void addEncoding(final char from, final char to) {
!         ENCODING[from - MIN_ENCODING] = to;
!         DECODING[to - MIN_DECODING] = from;
      }
  }
--- 24,432 ----
   */
  
  package jdk.nashorn.internal.runtime.linker;
  
  /**
+  * <p>
   * Implements the name mangling and demangling as specified by John Rose's
   * <a href="https://blogs.oracle.com/jrose/entry/symbolic_freedom_in_the_vm"
   * target="_blank">"Symbolic Freedom in the VM"</a> article. Normally, you would
   * mangle the names in the call sites as you're generating bytecode, and then
   * demangle them when you receive them in bootstrap methods.
+  * </p>
+  * <p>
+  * This code is derived from sun.invoke.util.BytecodeName. It is a subset of that
+  * class, copied here so that the nashorn module does not depend on a non-exported
+  * package of java.base.
+  * </p>
+  *
+  * <h3>Comment from BytecodeName class reproduced here:</h3>
+  *
+  * Includes universal mangling rules for the JVM.
+  *
+  * <h3>Avoiding Dangerous Characters </h3>
+  *
+  * <p>
+  * The JVM defines a very small set of characters which are illegal
+  * in name spellings.  We will slightly extend and regularize this set
+  * into a group of <cite>dangerous characters</cite>.
+  * These characters will then be replaced, in mangled names, by escape sequences.
+  * In addition, accidental escape sequences must be further escaped.
+  * Finally, a special prefix will be applied if and only if
+  * the mangling would otherwise fail to begin with the escape character.
+  * This happens to cover the corner case of the null string,
+  * and also clearly marks symbols which need demangling.
+  * </p>
+  * <p>
+  * Dangerous characters are the union of all characters forbidden
+  * or otherwise restricted by the JVM specification,
+  * plus their mates, if they are brackets
+  * (<code><big><b>[</b></big></code> and <code><big><b>]</b></big></code>,
+  * <code><big><b>&lt;</b></big></code> and <code><big><b>&gt;</b></big></code>),
+  * plus, arbitrarily, the colon character <code><big><b>:</b></big></code>.
+  * There is no distinction between type, method, and field names.
+  * This makes it easier to convert between mangled names of different
+  * types, since they do not need to be decoded (demangled).
+  * </p>
+  * <p>
+  * The escape character is backslash <code><big><b>\</b></big></code>
+  * (also known as reverse solidus).
+  * This character is, until now, unheard of in bytecode names,
+  * but traditional in the proposed role.
+  *
+  * </p>
+  * <h3> Replacement Characters </h3>
+  *
+  *
+  * <p>
+  * Every escape sequence is two characters
+  * (in fact, two UTF8 bytes) beginning with
+  * the escape character and followed by a
+  * <cite>replacement character</cite>.
+  * (Since the replacement character is never a backslash,
+  * iterated manglings do not double in size.)
+  * </p>
+  * <p>
+  * Each dangerous character has some rough visual similarity
+  * to its corresponding replacement character.
+  * This makes mangled symbols easier to recognize by sight.
+  * </p>
+  * <p>
+  * The dangerous characters are
+  * <code><big><b>/</b></big></code> (forward slash, used to delimit package components),
+  * <code><big><b>.</b></big></code> (dot, also a package delimiter),
+  * <code><big><b>;</b></big></code> (semicolon, used in signatures),
+  * <code><big><b>$</b></big></code> (dollar, used in inner classes and synthetic members),
+  * <code><big><b>&lt;</b></big></code> (left angle),
+  * <code><big><b>&gt;</b></big></code> (right angle),
+  * <code><big><b>[</b></big></code> (left square bracket, used in array types),
+  * <code><big><b>]</b></big></code> (right square bracket, reserved in this scheme for language use),
+  * and <code><big><b>:</b></big></code> (colon, reserved in this scheme for language use).
+  * Their replacements are, respectively,
+  * <code><big><b>|</b></big></code> (vertical bar),
+  * <code><big><b>,</b></big></code> (comma),
+  * <code><big><b>?</b></big></code> (question mark),
+  * <code><big><b>%</b></big></code> (percent),
+  * <code><big><b>^</b></big></code> (caret),
+  * <code><big><b>_</b></big></code> (underscore),
+  * <code><big><b>{</b></big></code> (left curly bracket),
+  * <code><big><b>}</b></big></code> (right curly bracket), and
+  * <code><big><b>!</b></big></code> (exclamation mark).
+  * In addition, the replacement character for the escape character itself is
+  * <code><big><b>-</b></big></code> (hyphen),
+  * and the replacement character for the null prefix is
+  * <code><big><b>=</b></big></code> (equal sign).
+  * </p>
+  * <p>
+  * An escape character <code><big><b>\</b></big></code>
+  * followed by any of these replacement characters
+  * is an escape sequence, and there are no other escape sequences.
+  * An equal sign is only part of an escape sequence
+  * if it is the second character in the whole string, following a backslash.
+  * Two consecutive backslashes do <em>not</em> form an escape sequence.
+  * </p>
+  * <p>
+  * Each escape sequence replaces a so-called <cite>original character</cite>
+  * which is either one of the dangerous characters or the escape character.
+  * A null prefix replaces an initial null string, not a character.
+  * </p>
+  * <p>
+  * All this implies that escape sequences cannot overlap and may be
+  * determined all at once for a whole string.  Note that a spelling
+  * string can contain <cite>accidental escapes</cite>, apparent escape
+  * sequences which must not be interpreted as manglings.
+  * These are disabled by replacing their leading backslash with an
+  * escape sequence (<code><big><b>\-</b></big></code>).  To mangle a string, three logical steps
+  * are required, though they may be carried out in one pass:
+  * </p>
+  * <ol>
+  *   <li>In each accidental escape, replace the backslash with an escape sequence
+  * (<code><big><b>\-</b></big></code>).</li>
+  *   <li>Replace each dangerous character with an escape sequence
+  * (<code><big><b>\|</b></big></code> for <code><big><b>/</b></big></code>, etc.).</li>
+  *   <li>If the first two steps introduced any change, <em>and</em>
+  * if the string does not already begin with a backslash, prepend a null prefix (<code><big><b>\=</b></big></code>).</li>
+  * </ol>
+  *
+  * To demangle a mangled string that begins with an escape,
+  * remove any null prefix, and then replace (in parallel)
+  * each escape sequence by its original character.
+  * <p>Spelling strings which contain accidental
+  * escapes <em>must</em> have them replaced, even if those
+  * strings do not contain dangerous characters.
+  * This restriction means that mangling a string always
+  * requires a scan of the string for escapes.
+  * But then, a scan would be required anyway,
+  * to check for dangerous characters.
+  *
+  * </p>
+  * <h3> Nice Properties </h3>
+  *
+  * <p>
+  * If a bytecode name does not contain any escape sequence,
+  * demangling is a no-op:  The string demangles to itself.
+  * Such a string is called <cite>self-mangling</cite>.
+  * Almost all strings are self-mangling.
+  * In practice, to demangle almost any name &ldquo;found in nature&rdquo;,
+  * simply verify that it does not begin with a backslash.
+  * </p>
+  * <p>
+  * Mangling is a one-to-one function, while demangling
+  * is a many-to-one function.
+  * A mangled string is defined as <cite>validly mangled</cite> if
+  * it is in fact the unique mangling of its spelling string.
+  * Three examples of invalidly mangled strings are <code><big><b>\=foo</b></big></code>,
+  * <code><big><b>\-bar</b></big></code>, and <code><big><b>baz\!</b></big></code>, which demangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and
+  * <code><big><b>baz\!</b></big></code>, but then remangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and <code><big><b>\=baz\-!</b></big></code>.
+  * If a language back-end or runtime is using mangled names,
+  * it should never present an invalidly mangled bytecode
+  * name to the JVM.  If the runtime encounters one,
+  * it should also report an error, since such an occurrence
+  * probably indicates a bug in name encoding which
+  * will lead to errors in linkage.
+  * However, this note does not propose that the JVM verifier
+  * detect invalidly mangled names.
+  * </p>
+  * <p>
+  * As a result of these rules, it is a simple matter to
+  * compute validly mangled substrings and concatenations
+  * of validly mangled strings, and (with a little care)
+  * these correspond to the analogous operations on their
+  * spelling strings.
+  * </p>
+  * <ul>
+  *   <li>Any prefix of a validly mangled string is also validly mangled,
+  * although a null prefix may need to be removed.</li>
+  *   <li>Any suffix of a validly mangled string is also validly mangled,
+  * although a null prefix may need to be added.</li>
+  *   <li>Two validly mangled strings, when concatenated,
+  * are also validly mangled, although any null prefix
+  * must be removed from the second string,
+  * and a trailing backslash on the first string may need escaping,
+  * if it would participate in an accidental escape when followed
+  * by the first character of the second string.</li>
+  * </ul>
+  * <p>If languages that include non-Java symbol spellings use this
+  * mangling convention, they will enjoy the following advantages:
+  * </p>
+  * <ul>
+  *   <li>They can interoperate via symbols they share in common.</li>
+  *   <li>Low-level tools, such as backtrace printers, will have readable displays.</li>
+  *   <li>Future JVM and language extensions can safely use the dangerous characters
+  * for structuring symbols, but will never interfere with valid spellings.</li>
+  *   <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>
+  *   <li>Occasional transliterations and name composition will be simple and regular,
+  * for classes, methods, and fields.</li>
+  *   <li>Bytecode names will continue to be compact.
+  * When mangled, spellings will at most double in length, either in
+  * UTF8 or UTF16 format, and most will not change at all.</li>
+  * </ul>
+  *
+  *
+  * <h3> Suggestions for Human Readable Presentations </h3>
+  *
+  *
+  * <p>
+  * For human readable displays of symbols,
+  * it will be better to present a string-like quoted
+  * representation of the spelling, because JVM users
+  * are generally familiar with such tokens.
+  * We suggest using single or double quotes before and after
+  * mangled symbols which are not valid Java identifiers,
+  * with quotes, backslashes, and non-printing characters
+  * escaped as if for literals in the Java language.
+  * </p>
+  * <p>
+  * For example, an HTML-like spelling
+  * <code><big><b>&lt;pre&gt;</b></big></code> mangles to
+  * <code><big><b>\^pre\_</b></big></code> and could
+  * display more cleanly as
+  * <code><big><b>'&lt;pre&gt;'</b></big></code>,
+  * with the quotes included.
+  * Such string-like conventions are <em>not</em> suitable
+  * for mangled bytecode names, in part because
+  * dangerous characters must be eliminated, rather
+  * than just quoted.  Otherwise internally structured
+  * strings like package prefixes and method signatures
+  * could not be reliably parsed.
+  * </p>
+  * <p>
+  * In such human-readable displays, invalidly mangled
+  * names should <em>not</em> be demangled and quoted,
+  * for this would be misleading.  Likewise, JVM symbols
+  * which contain dangerous characters (like dots in field
+  * names or brackets in method names) should not be
+  * simply quoted.  The bytecode names
+  * <code><big><b>\=phase\,1</b></big></code> and
+  * <code><big><b>phase.1</b></big></code> are distinct,
+  * and in demangled displays they should be presented as
+  * <code><big><b>'phase.1'</b></big></code> and something like
+  * <code><big><b>'phase'.1</b></big></code>, respectively.
+  * </p>
   */
  public final class NameCodec {
!     private NameCodec() {
!     }
! 
!     private static final char ESCAPE_C = '\\';
!     // empty escape sequence to avoid a null name or illegal prefix
!     private static final char NULL_ESCAPE_C = '=';
!     private static final String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;
! 
      /**
       * Canonical encoding for the empty name.
       */
!     public static final String EMPTY_NAME =  new String(new char[] { ESCAPE_C, NULL_ESCAPE_C });
  
      /**
       * Encodes ("mangles") an unencoded symbolic name.
       * @param name the symbolic name to mangle
       * @return the mangled form of the symbolic name.
       */
      public static String encode(final String name) {
!         String bn = mangle(name);
!         assert((Object)bn == name || looksMangled(bn)) : bn;
!         assert(name.equals(decode(bn))) : name;
!         return bn;
!     }
! 
!     /**
!      * Decodes ("demangles") an encoded symbolic name.
!      * @param name the symbolic name to demangle
!      * @return the demangled form of the symbolic name.
!      */
!     public static String decode(final String name) {
!         String sn = name;
!         if (!sn.isEmpty() && looksMangled(name)) {
!             sn = demangle(name);
!             assert(name.equals(mangle(sn))) : name+" => "+sn+" => "+mangle(sn);
!         }
!         return sn;
!     }
! 
!     private static boolean looksMangled(String s) {
!         return s.charAt(0) == ESCAPE_C;
!     }
! 
!     private static String mangle(String s) {
!         if (s.length() == 0)
!             return NULL_ESCAPE;
! 
!         // build this lazily, when we first need an escape:
!         StringBuilder sb = null;
! 
!         for (int i = 0, slen = s.length(); i < slen; i++) {
!             char c = s.charAt(i);
! 
!             boolean needEscape = false;
!             if (c == ESCAPE_C) {
!                 if (i+1 < slen) {
!                     char c1 = s.charAt(i+1);
!                     if ((i == 0 && c1 == NULL_ESCAPE_C)
!                         || c1 != originalOfReplacement(c1)) {
!                         // an accidental escape
!                         needEscape = true;
!                     }
                  }
              } else {
!                 needEscape = isDangerous(c);
!             }
! 
!             if (!needEscape) {
!                 if (sb != null)  sb.append(c);
!                 continue;
              }
! 
!             // build sb if this is the first escape
!             if (sb == null) {
!                 sb = new StringBuilder(s.length()+10);
!                 // mangled names must begin with a backslash:
!                 if (s.charAt(0) != ESCAPE_C && i > 0)
!                     sb.append(NULL_ESCAPE);
!                 // append the string so far, which is unremarkable:
!                 sb.append(s, 0, i);
              }
+ 
+             // rewrite \ to \-, / to \|, etc.
+             sb.append(ESCAPE_C);
+             sb.append(replacementOf(c));
          }
+ 
+         if (sb != null)   return sb.toString();
+ 
+         return s;
      }
! 
!     private static String demangle(String s) {
!         // build this lazily, when we first meet an escape:
!         StringBuilder sb = null;
! 
!         int stringStart = 0;
!         if (s.startsWith(NULL_ESCAPE))
!             stringStart = 2;
! 
!         for (int i = stringStart, slen = s.length(); i < slen; i++) {
!             char c = s.charAt(i);
! 
!             if (c == ESCAPE_C && i+1 < slen) {
!                 // might be an escape sequence
!                 char rc = s.charAt(i+1);
!                 char oc = originalOfReplacement(rc);
!                 if (oc != rc) {
!                     // build sb if this is the first escape
!                     if (sb == null) {
!                         sb = new StringBuilder(s.length());
!                         // append the string so far, which is unremarkable:
!                         sb.append(s, stringStart, i);
!                     }
!                     ++i;  // skip both characters
!                     c = oc;
                  }
              }
  
!             if (sb != null)
!                 sb.append(c);
          }
! 
!         if (sb != null)   return sb.toString();
! 
!         return s.substring(stringStart);
!     }
! 
!     private static final String DANGEROUS_CHARS   = "\\/.;:$[]<>"; // \\ must be first
!     private static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
!     private static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\
! 
!     private static final long[] SPECIAL_BITMAP = new long[2];  // 128 bits
!     static {
!         String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS;
!         for (char c : SPECIAL.toCharArray()) {
!             SPECIAL_BITMAP[c >>> 6] |= 1L << c;
!         }
!     }
! 
!     private static boolean isSpecial(char c) {
!         if ((c >>> 6) < SPECIAL_BITMAP.length)
!             return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;
!         else
!             return false;
!     }
! 
!     private static char replacementOf(char c) {
!         if (!isSpecial(c))  return c;
!         int i = DANGEROUS_CHARS.indexOf(c);
!         if (i < 0)  return c;
!         return REPLACEMENT_CHARS.charAt(i);
!     }
! 
!     private static char originalOfReplacement(char c) {
!         if (!isSpecial(c))  return c;
!         int i = REPLACEMENT_CHARS.indexOf(c);
!         if (i < 0)  return c;
!         return DANGEROUS_CHARS.charAt(i);
!     }
! 
!     private static boolean isDangerous(char c) {
!         if (!isSpecial(c))  return false;
!         return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX);
      }
  }
< prev index next >