1 /*
   2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.nashorn.internal.runtime.linker;
  27 
  28 /**
  29  * <p>
  30  * Implements the name mangling and demangling as specified by John Rose's
  31  * <a href="https://blogs.oracle.com/jrose/entry/symbolic_freedom_in_the_vm"
  32  * target="_blank">"Symbolic Freedom in the VM"</a> article. Normally, you would
  33  * mangle the names in the call sites as you're generating bytecode, and then
  34  * demangle them when you receive them in bootstrap methods.
  35  * </p>
  36  * <p>
 * This code is derived from sun.invoke.util.BytecodeName. Apart from using only a subset
 * of that class, we don't want the nashorn module to depend on a non-exported package
 * of java.base.
  40  * </p>
  41  *
  42  * <h2>Comment from BytecodeName class reproduced here:</h2>
  43  *
  44  * Includes universal mangling rules for the JVM.
  45  *
  46  * <h2>Avoiding Dangerous Characters </h2>
  47  *
  48  * <p>
  49  * The JVM defines a very small set of characters which are illegal
  50  * in name spellings.  We will slightly extend and regularize this set
  51  * into a group of <cite>dangerous characters</cite>.
  52  * These characters will then be replaced, in mangled names, by escape sequences.
  53  * In addition, accidental escape sequences must be further escaped.
  54  * Finally, a special prefix will be applied if and only if
  55  * the mangling would otherwise fail to begin with the escape character.
  56  * This happens to cover the corner case of the null string,
  57  * and also clearly marks symbols which need demangling.
  58  * </p>
  59  * <p>
  60  * Dangerous characters are the union of all characters forbidden
  61  * or otherwise restricted by the JVM specification,
  62  * plus their mates, if they are brackets
  63  * (<code><b>[</b></code> and <code><b>]</b></code>,
  64  * <code><b>&lt;</b></code> and <code><b>&gt;</b></code>),
  65  * plus, arbitrarily, the colon character <code><b>:</b></code>.
  66  * There is no distinction between type, method, and field names.
  67  * This makes it easier to convert between mangled names of different
  68  * types, since they do not need to be decoded (demangled).
  69  * </p>
  70  * <p>
  71  * The escape character is backslash <code><b>\</b></code>
  72  * (also known as reverse solidus).
  73  * This character is, until now, unheard of in bytecode names,
  74  * but traditional in the proposed role.
  75  *
  76  * </p>
  77  * <h2> Replacement Characters </h2>
  78  *
  79  *
  80  * <p>
  81  * Every escape sequence is two characters
  82  * (in fact, two UTF8 bytes) beginning with
  83  * the escape character and followed by a
  84  * <cite>replacement character</cite>.
  85  * (Since the replacement character is never a backslash,
  86  * iterated manglings do not double in size.)
  87  * </p>
  88  * <p>
  89  * Each dangerous character has some rough visual similarity
  90  * to its corresponding replacement character.
  91  * This makes mangled symbols easier to recognize by sight.
  92  * </p>
  93  * <p>
  94  * The dangerous characters are
  95  * <code><b>/</b></code> (forward slash, used to delimit package components),
  96  * <code><b>.</b></code> (dot, also a package delimiter),
  97  * <code><b>;</b></code> (semicolon, used in signatures),
  98  * <code><b>$</b></code> (dollar, used in inner classes and synthetic members),
  99  * <code><b>&lt;</b></code> (left angle),
 100  * <code><b>&gt;</b></code> (right angle),
 101  * <code><b>[</b></code> (left square bracket, used in array types),
 102  * <code><b>]</b></code> (right square bracket, reserved in this scheme for language use),
 103  * and <code><b>:</b></code> (colon, reserved in this scheme for language use).
 104  * Their replacements are, respectively,
 105  * <code><b>|</b></code> (vertical bar),
 106  * <code><b>,</b></code> (comma),
 107  * <code><b>?</b></code> (question mark),
 108  * <code><b>%</b></code> (percent),
 109  * <code><b>^</b></code> (caret),
 * <code><b>_</b></code> (underscore),
 * <code><b>{</b></code> (left curly bracket),
 * <code><b>}</b></code> (right curly bracket), and
 * <code><b>!</b></code> (exclamation mark).
 114  * In addition, the replacement character for the escape character itself is
 115  * <code><b>-</b></code> (hyphen),
 116  * and the replacement character for the null prefix is
 117  * <code><b>=</b></code> (equal sign).
 118  * </p>
 119  * <p>
 120  * An escape character <code><b>\</b></code>
 121  * followed by any of these replacement characters
 122  * is an escape sequence, and there are no other escape sequences.
 123  * An equal sign is only part of an escape sequence
 124  * if it is the second character in the whole string, following a backslash.
 125  * Two consecutive backslashes do <em>not</em> form an escape sequence.
 126  * </p>
 127  * <p>
 128  * Each escape sequence replaces a so-called <cite>original character</cite>
 129  * which is either one of the dangerous characters or the escape character.
 130  * A null prefix replaces an initial null string, not a character.
 131  * </p>
 132  * <p>
 133  * All this implies that escape sequences cannot overlap and may be
 134  * determined all at once for a whole string.  Note that a spelling
 135  * string can contain <cite>accidental escapes</cite>, apparent escape
 136  * sequences which must not be interpreted as manglings.
 137  * These are disabled by replacing their leading backslash with an
 138  * escape sequence (<code><b>\-</b></code>).  To mangle a string, three logical steps
 139  * are required, though they may be carried out in one pass:
 140  * </p>
 141  * <ol>
 142  *   <li>In each accidental escape, replace the backslash with an escape sequence
 143  * (<code><b>\-</b></code>).</li>
 144  *   <li>Replace each dangerous character with an escape sequence
 145  * (<code><b>\|</b></code> for <code><b>/</b></code>, etc.).</li>
 146  *   <li>If the first two steps introduced any change, <em>and</em>
 147  * if the string does not already begin with a backslash, prepend a null prefix (<code><b>\=</b></code>).</li>
 148  * </ol>
 149  *
 150  * To demangle a mangled string that begins with an escape,
 151  * remove any null prefix, and then replace (in parallel)
 152  * each escape sequence by its original character.
 153  * <p>Spelling strings which contain accidental
 154  * escapes <em>must</em> have them replaced, even if those
 155  * strings do not contain dangerous characters.
 156  * This restriction means that mangling a string always
 157  * requires a scan of the string for escapes.
 158  * But then, a scan would be required anyway,
 159  * to check for dangerous characters.
 160  *
 161  * </p>
 162  * <h2> Nice Properties </h2>
 163  *
 164  * <p>
 165  * If a bytecode name does not contain any escape sequence,
 166  * demangling is a no-op:  The string demangles to itself.
 167  * Such a string is called <cite>self-mangling</cite>.
 168  * Almost all strings are self-mangling.
 169  * In practice, to demangle almost any name &ldquo;found in nature&rdquo;,
 170  * simply verify that it does not begin with a backslash.
 171  * </p>
 172  * <p>
 173  * Mangling is a one-to-one function, while demangling
 174  * is a many-to-one function.
 175  * A mangled string is defined as <cite>validly mangled</cite> if
 176  * it is in fact the unique mangling of its spelling string.
 177  * Three examples of invalidly mangled strings are <code><b>\=foo</b></code>,
 178  * <code><b>\-bar</b></code>, and <code><b>baz\!</b></code>, which demangle to <code><b>foo</b></code>, <code><b>\bar</b></code>, and
 179  * <code><b>baz\!</b></code>, but then remangle to <code><b>foo</b></code>, <code><b>\bar</b></code>, and <code><b>\=baz\-!</b></code>.
 180  * If a language back-end or runtime is using mangled names,
 181  * it should never present an invalidly mangled bytecode
 182  * name to the JVM.  If the runtime encounters one,
 183  * it should also report an error, since such an occurrence
 184  * probably indicates a bug in name encoding which
 185  * will lead to errors in linkage.
 186  * However, this note does not propose that the JVM verifier
 187  * detect invalidly mangled names.
 188  * </p>
 189  * <p>
 190  * As a result of these rules, it is a simple matter to
 191  * compute validly mangled substrings and concatenations
 192  * of validly mangled strings, and (with a little care)
 193  * these correspond to corresponding operations on their
 194  * spelling strings.
 195  * </p>
 196  * <ul>
 197  *   <li>Any prefix of a validly mangled string is also validly mangled,
 198  * although a null prefix may need to be removed.</li>
 199  *   <li>Any suffix of a validly mangled string is also validly mangled,
 200  * although a null prefix may need to be added.</li>
 201  *   <li>Two validly mangled strings, when concatenated,
 202  * are also validly mangled, although any null prefix
 203  * must be removed from the second string,
 204  * and a trailing backslash on the first string may need escaping,
 205  * if it would participate in an accidental escape when followed
 206  * by the first character of the second string.</li>
 207  * </ul>
 208  * <p>If languages that include non-Java symbol spellings use this
 209  * mangling convention, they will enjoy the following advantages:
 210  * </p>
 211  * <ul>
 212  *   <li>They can interoperate via symbols they share in common.</li>
 213  *   <li>Low-level tools, such as backtrace printers, will have readable displays.</li>
 214  *   <li>Future JVM and language extensions can safely use the dangerous characters
 215  * for structuring symbols, but will never interfere with valid spellings.</li>
 216  *   <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>
 217  *   <li>Occasional transliterations and name composition will be simple and regular,
 218  * for classes, methods, and fields.</li>
 219  *   <li>Bytecode names will continue to be compact.
 220  * When mangled, spellings will at most double in length, either in
 221  * UTF8 or UTF16 format, and most will not change at all.</li>
 222  * </ul>
 223  *
 224  *
 225  * <h2> Suggestions for Human Readable Presentations </h2>
 226  *
 227  *
 228  * <p>
 229  * For human readable displays of symbols,
 230  * it will be better to present a string-like quoted
 231  * representation of the spelling, because JVM users
 232  * are generally familiar with such tokens.
 233  * We suggest using single or double quotes before and after
 234  * mangled symbols which are not valid Java identifiers,
 235  * with quotes, backslashes, and non-printing characters
 236  * escaped as if for literals in the Java language.
 237  * </p>
 238  * <p>
 239  * For example, an HTML-like spelling
 240  * <code><b>&lt;pre&gt;</b></code> mangles to
 241  * <code><b>\^pre\_</b></code> and could
 242  * display more cleanly as
 243  * <code><b>'&lt;pre&gt;'</b></code>,
 244  * with the quotes included.
 245  * Such string-like conventions are <em>not</em> suitable
 246  * for mangled bytecode names, in part because
 247  * dangerous characters must be eliminated, rather
 248  * than just quoted.  Otherwise internally structured
 249  * strings like package prefixes and method signatures
 250  * could not be reliably parsed.
 251  * </p>
 252  * <p>
 253  * In such human-readable displays, invalidly mangled
 254  * names should <em>not</em> be demangled and quoted,
 255  * for this would be misleading.  Likewise, JVM symbols
 256  * which contain dangerous characters (like dots in field
 257  * names or brackets in method names) should not be
 258  * simply quoted.  The bytecode names
 259  * <code><b>\=phase\,1</b></code> and
 260  * <code><b>phase.1</b></code> are distinct,
 261  * and in demangled displays they should be presented as
 262  * <code><b>'phase.1'</b></code> and something like
 263  * <code><b>'phase'.1</b></code>, respectively.
 264  * </p>
 265  */
 266 public final class NameCodec {
 267     private NameCodec() {
 268     }
 269 
 270     private static final char ESCAPE_C = '\\';
 271     // empty escape sequence to avoid a null name or illegal prefix
 272     private static final char NULL_ESCAPE_C = '=';
 273     private static final String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;
 274 
 275     /**
 276      * Canonical encoding for the empty name.
 277      */
 278     public static final String EMPTY_NAME =  new String(new char[] { ESCAPE_C, NULL_ESCAPE_C });
 279 
 280     /**
 281      * Encodes ("mangles") an unencoded symbolic name.
 282      * @param name the symbolic name to mangle
 283      * @return the mangled form of the symbolic name.
 284      */
 285     public static String encode(final String name) {
 286         final String bn = mangle(name);
 287         assert((Object)bn == name || looksMangled(bn)) : bn;
 288         assert(name.equals(decode(bn))) : name;
 289         return bn;
 290     }
 291 
 292     /**
 293      * Decodes ("demangles") an encoded symbolic name.
 294      * @param name the symbolic name to demangle
 295      * @return the demangled form of the symbolic name.
 296      */
 297     public static String decode(final String name) {
 298         String sn = name;
 299         if (!sn.isEmpty() && looksMangled(name)) {
 300             sn = demangle(name);
 301             assert(name.equals(mangle(sn))) : name+" => "+sn+" => "+mangle(sn);
 302         }
 303         return sn;
 304     }
 305 
 306     private static boolean looksMangled(final String s) {
 307         return s.charAt(0) == ESCAPE_C;
 308     }
 309 
 310     private static String mangle(final String s) {
 311         if (s.length() == 0)
 312             return NULL_ESCAPE;
 313 
 314         // build this lazily, when we first need an escape:
 315         StringBuilder sb = null;
 316 
 317         for (int i = 0, slen = s.length(); i < slen; i++) {
 318             final char c = s.charAt(i);
 319 
 320             boolean needEscape = false;
 321             if (c == ESCAPE_C) {
 322                 if (i+1 < slen) {
 323                     final char c1 = s.charAt(i+1);
 324                     if ((i == 0 && c1 == NULL_ESCAPE_C)
 325                         || c1 != originalOfReplacement(c1)) {
 326                         // an accidental escape
 327                         needEscape = true;
 328                     }
 329                 }
 330             } else {
 331                 needEscape = isDangerous(c);
 332             }
 333 
 334             if (!needEscape) {
 335                 if (sb != null)  sb.append(c);
 336                 continue;
 337             }
 338 
 339             // build sb if this is the first escape
 340             if (sb == null) {
 341                 sb = new StringBuilder(s.length()+10);
 342                 // mangled names must begin with a backslash:
 343                 if (s.charAt(0) != ESCAPE_C && i > 0)
 344                     sb.append(NULL_ESCAPE);
 345                 // append the string so far, which is unremarkable:
 346                 sb.append(s, 0, i);
 347             }
 348 
 349             // rewrite \ to \-, / to \|, etc.
 350             sb.append(ESCAPE_C);
 351             sb.append(replacementOf(c));
 352         }
 353 
 354         if (sb != null)   return sb.toString();
 355 
 356         return s;
 357     }
 358 
 359     private static String demangle(final String s) {
 360         // build this lazily, when we first meet an escape:
 361         StringBuilder sb = null;
 362 
 363         int stringStart = 0;
 364         if (s.startsWith(NULL_ESCAPE))
 365             stringStart = 2;
 366 
 367         for (int i = stringStart, slen = s.length(); i < slen; i++) {
 368             char c = s.charAt(i);
 369 
 370             if (c == ESCAPE_C && i+1 < slen) {
 371                 // might be an escape sequence
 372                 final char rc = s.charAt(i+1);
 373                 final char oc = originalOfReplacement(rc);
 374                 if (oc != rc) {
 375                     // build sb if this is the first escape
 376                     if (sb == null) {
 377                         sb = new StringBuilder(s.length());
 378                         // append the string so far, which is unremarkable:
 379                         sb.append(s, stringStart, i);
 380                     }
 381                     ++i;  // skip both characters
 382                     c = oc;
 383                 }
 384             }
 385 
 386             if (sb != null)
 387                 sb.append(c);
 388         }
 389 
 390         if (sb != null)   return sb.toString();
 391 
 392         return s.substring(stringStart);
 393     }
 394 
 395     private static final String DANGEROUS_CHARS   = "\\/.;:$[]<>"; // \\ must be first
 396     private static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
 397     private static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\
 398 
 399     private static final long[] SPECIAL_BITMAP = new long[2];  // 128 bits
 400     static {
 401         final String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS;
 402         for (final char c : SPECIAL.toCharArray()) {
 403             SPECIAL_BITMAP[c >>> 6] |= 1L << c;
 404         }
 405     }
 406 
 407     private static boolean isSpecial(final char c) {
 408         if ((c >>> 6) < SPECIAL_BITMAP.length)
 409             return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;
 410         else
 411             return false;
 412     }
 413 
 414     private static char replacementOf(final char c) {
 415         if (!isSpecial(c))  return c;
 416         final int i = DANGEROUS_CHARS.indexOf(c);
 417         if (i < 0)  return c;
 418         return REPLACEMENT_CHARS.charAt(i);
 419     }
 420 
 421     private static char originalOfReplacement(final char c) {
 422         if (!isSpecial(c))  return c;
 423         final int i = REPLACEMENT_CHARS.indexOf(c);
 424         if (i < 0)  return c;
 425         return DANGEROUS_CHARS.charAt(i);
 426     }
 427 
 428     private static boolean isDangerous(final char c) {
 429         if (!isSpecial(c))  return false;
 430         return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX);
 431     }
 432 }