1 /*
   2  * Copyright (c) 2004, 2007, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /**
  25  *  @test
  26  *  @bug 5033550
  27  *  @summary  JDWP back end uses modified UTF-8
  28  *
  29  *  @author jjh
  30  *
  31  *  @run build TestScaffold VMConnection TargetListener TargetAdapter
  32  *  @run compile -g UTF8Test.java
  33  *  @run driver UTF8Test
  34  */
  35 
  36 /*
  37   There is UTF-8 and there is modified UTF-8, which I will call M-UTF-8.
  38   The two differ in the representation of binary 0, and
  39   in some other more esoteric representations.
  40   See
  41       http://java.sun.com/developer/technicalArticles/Intl/Supplementary/#Modified_UTF-8
  42       http://java.sun.com/javase/6/docs/technotes/guides/jni/spec/types.html#wp16542
  43 
  44   All the following are observations of the treatment
  of binary 0.  In UTF-8, this is represented as one byte:
  46       0x00
  47 
  48   while in modified UTF-8, it is represented as two bytes
  49       0xc0 0x80
  50 
  51   ** I haven't investigated if the other differences between UTF-8 and
  52      M-UTF-8 are handled in the same way.
  53 
 Here is how these are handled in our BE, JDWP, and FE:
  55 
  56  - Strings in .class files are M-UTF-8.
  57 
  58  - To get the value of a string object from the VM, our BE calls
  59       char * utf = JNI_FUNC_PTR(env,GetStringUTFChars)(env, string, NULL);
  60    which returns M-UTF-8.
  61 
  62 - To create a string object in the VM, our BE VirtualMachine.createString() calls
  63       string = JNI_FUNC_PTR(env,NewStringUTF)(env, cstring);
  64       This function expects the string to be M-UTF-8
  65       BUG:  If the string came from JDWP, then it is actually UTF-8
  66 
  67 - I haven't investigated strings in JVMTI.
  68 
  69 - The JDWP spec says that strings are UTF-8.  The intro
  70   says this for all strings, and the createString command and
  the StringReference.value command say it explicitly.
  72 
  73 - Our FE java writes strings to JDWP as UTF-8.
  74 
  75 - BE function outStream_writeString uses strlen meaning
  76   it expects no 0 bytes, meaning that it expects M-UTF-8
  77   This function writes the byte length and then calls
  78   outStream.c::writeBytes which just writes the bytes to JDWP as is.
  79 
  80   BUG: If such a string came from the VM via JNI, it is actually
  81        M-UTF-8
  82   FIX:  - scan string to see if contains an M-UTF-8 char.
  83           if yes,
  84              - call String(bytes, 0, len, "UTF8")
  85                to get a java string.  Will this work -ie, the
  86                input is M-UTF-8 instead of real UTF-8
  87              - call some java method (NOT JNI which
  88                would just come back with M-UTF-8)
  89                on the String to get real UTF-8
  90 
  91 
- The JDWP StringReference.value command reads a string
  from the BE out of the JDWP stream and does this to
  create a Java String for it (see PacketStream.readString):
  95          String readString() {
  96           String ret;
  97           int len = readInt();
  98 
  99           try {
 100               ret = new String(pkt.data, inCursor, len, "UTF8");
 101           } catch(java.io.UnsupportedEncodingException e) {
 102 
  This String ctor converts _both_ the M-UTF-8 0xc0 0x80
 104   and UTF-8 0x00  into a Java char containing 0x0000
 105 
 106   Does it do this for the other differences too?
 107 
 108 Summary:
 109 1.  JDWP says strings are UTF-8.
 110     We interpret this to mean standard UTF-8.
 111 
 112 2.  JVMTI will be changed to match JNI saying that strings
 113     are M-UTF-8.
 114 
 115 3.  The BE gets UTF-8 strings off JDWP and must convert them to
 116     M-UTF-8 before giving it to JVMTI or JNI.
 117 
 118 4.  The BE gets M-UTF-8 strings from JNI and JVMTI and
 119     must convert them to UTF-8 when writing to JDWP.
 120 
 121 
 122  Here is how the supplementals are represented in java Strings.
 123  This from java.lang.Character doc:
 124     The Java 2 platform uses the UTF-16 representation in char arrays and
 125     in the String and StringBuffer classes. In this representation,
 126     supplementary characters are represented as a pair of char values,
 127     the first from the high-surrogates range, (\uD800-\uDBFF), the second
 128     from the low-surrogates range (\uDC00-\uDFFF).
 129   See utf8.txt
 130 
 131 
 132 ----
 133 
 134 NSK Packet.java in the nsk/share/jdwp framework does this to write
 135 a string to JDWP:
 136  public void addString(String value) {
 137         final int count = JDWP.TypeSize.INT + value.length();
 138         addInt(value.length());
 139         try {
 140             addBytes(value.getBytes("UTF-8"), 0, value.length());
 141         } catch (UnsupportedEncodingException e) {
 142             throw new Failure("Unsupported UTF-8 ecnoding while adding string value to JDWP packet:\n\t"
 143                                 + e);
 144         }
 145     }
 146  ?? Does this get the standard UTF-8?  I would expect so.
 147 
 148 and the readString method does this:
 149         for (int i = 0; i < len; i++)
 150             s[i] = getByte();
 151 
 152         try {
 153             return new String(s, "UTF-8");
 154         } catch (UnsupportedEncodingException e) {
 155             throw new Failure("Unsupported UTF-8 ecnoding while extracting string value from JDWP packet:\n\t"
 156                                 + e);
 157         }
 158 Thus, this won't notice the modified UTF-8 coming in from JDWP .
 159 
 160 
 161 */
 162 
 163 import com.sun.jdi.*;
 164 import com.sun.jdi.event.*;
 165 import com.sun.jdi.request.*;
 166 import java.io.UnsupportedEncodingException;
 167 import java.util.*;
 168 
 169     /********** target program **********/
 170 
 171 /*
 172  * The debuggee has a few Strings the debugger reads via JDI
 173  */
 174 class UTF8Targ {
 175     static String[] vals = new String[] {"xx\u0000yy",           // standard UTF-8 0
 176                                          "xx\ud800\udc00yy",     // first supplementary
 177                                          "xx\udbff\udfffyy"      // last supplementary
 178                                          // d800 = 1101 1000 0000 0000   dc00 = 1101 1100 0000 0000
 179                                          // dbff = 1101 1011 1111 1111   dfff = 1101 1111 1111 1111
 180     };
 181 
 182     static String aField;
 183 
 184     public static void main(String[] args){
 185         System.out.println("Howdy!");
 186         gus();
 187         System.out.println("Goodbye from UTF8Targ!");
 188     }
 189     static void gus() {
 190     }
 191 }
 192 
 193     /********** test program **********/
 194 
 195 public class UTF8Test extends TestScaffold {
 196     ClassType targetClass;
 197     ThreadReference mainThread;
 198     Field targetField;
 199     UTF8Test (String args[]) {
 200         super(args);
 201     }
 202 
 203     public static void main(String[] args)      throws Exception {
 204         new UTF8Test(args).startTests();
 205     }
 206 
 207     /********** test core **********/
 208 
 209     protected void runTests() throws Exception {
 210         /*
 211          * Get to the top of main()
 212          * to determine targetClass and mainThread
 213          */
 214         BreakpointEvent bpe = startToMain("UTF8Targ");
 215         targetClass = (ClassType)bpe.location().declaringType();
 216         targetField = targetClass.fieldByName("aField");
 217 
 218         ArrayReference targetVals = (ArrayReference)targetClass.getValue(targetClass.fieldByName("vals"));
 219 
 220         /* For each string in the debuggee's 'val' array, verify that we can
 221          * read that value via JDI.
 222          */
 223 
 224         for (int ii = 0; ii < UTF8Targ.vals.length; ii++) {
 225             StringReference val = (StringReference)targetVals.getValue(ii);
 226             String valStr = val.value();
 227 
 228             /*
 229              * Verify that we can read a value correctly.
 230              * We read it via JDI, and access it directly from the static
 231              * var in the debuggee class.
 232              */
 233             if (!valStr.equals(UTF8Targ.vals[ii]) ||
 234                 valStr.length() != UTF8Targ.vals[ii].length()) {
 235                 failure("     FAILED: Expected /" + printIt(UTF8Targ.vals[ii]) +
 236                         "/, but got /" + printIt(valStr) + "/, length = " + valStr.length());
 237             }
 238         }
 239 
 240         /* Test 'all' unicode chars - send them to the debuggee via JDI
 241          * and then read them back.
 242          */
 243         doFancyVersion();
 244 
 245         resumeTo("UTF8Targ", "gus", "()V");
 246         try {
 247             Thread.sleep(1000);
 248         } catch (InterruptedException ee) {
 249         }
 250 
 251 
 252         /*
 253          * resume the target listening for events
 254          */
 255 
 256         listenUntilVMDisconnect();
 257 
 258         /*
 259          * deal with results of test
 260          * if anything has called failure("foo") testFailed will be true
 261          */
 262         if (!testFailed) {
 263             println("UTF8Test: passed");
 264         } else {
 265             throw new Exception("UTF8Test: failed");
 266         }
 267     }
 268 
 269     /**
 270      * For each unicode value, send a string containing
 271      * it to the debuggee via JDI, read it back via JDI, and see if
 272      * we get the same value.
 273      */
 274     void doFancyVersion() throws Exception {
 275         // This does 4 chars at a time just to save time.
 276         for (int ii = Character.MIN_CODE_POINT;
 277              ii < Character.MIN_SUPPLEMENTARY_CODE_POINT;
 278              ii += 4) {
 279             // Skip the surrogates
 280             if (ii == Character.MIN_SURROGATE) {
 281                 ii = Character.MAX_SURROGATE - 3;
 282                 break;
 283             }
 284             doFancyTest(ii, ii + 1, ii + 2, ii + 3);
 285         }
 286 
 287         // Do the supplemental chars.
 288         for (int ii = Character.MIN_SUPPLEMENTARY_CODE_POINT;
 289              ii <= Character.MAX_CODE_POINT;
 290              ii += 2000) {
 291             // Too many of these so just do a few
 292             doFancyTest(ii, ii + 1, ii + 2, ii + 3);
 293         }
 294 
 295     }
 296 
 297     void doFancyTest(int ... args) throws Exception {
 298         String ss = new String(args, 0, 4);
 299         targetClass.setValue(targetField, vm().mirrorOf(ss));
 300 
 301         StringReference returnedVal = (StringReference)targetClass.getValue(targetField);
 302         String returnedStr = returnedVal.value();
 303 
 304         if (!ss.equals(returnedStr)) {
 305             failure("Set: FAILED: Expected /" + printIt(ss) +
 306                     "/, but got /" + printIt(returnedStr) + "/, length = " + returnedStr.length());
 307         }
 308     }
 309 
 310     /**
 311      * Return a String containing binary representations of
 312      * the chars in a String.
 313      */
 314      String printIt(String arg) {
 315         char[] carray = arg.toCharArray();
 316         StringBuffer bb = new StringBuffer(arg.length() * 5);
 317         for (int ii = 0; ii < arg.length(); ii++) {
 318             int ccc = arg.charAt(ii);
 319             bb.append(String.format("%1$04x ", ccc));
 320         }
 321         return bb.toString();
 322     }
 323 
 324     String printIt1(String arg) {
 325         byte[] barray = null;
 326         try {
 327              barray = arg.getBytes("UTF-8");
 328         } catch (UnsupportedEncodingException ee) {
 329         }
 330         StringBuffer bb = new StringBuffer(barray.length * 3);
 331         for (int ii = 0; ii < barray.length; ii++) {
 332             bb.append(String.format("%1$02x ", barray[ii]));
 333         }
 334         return bb.toString();
 335     }
 336 
 337 }