1 /* 2 * Copyright (c) 2004, 2007, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 /** 25 * @test 26 * @bug 5033550 27 * @summary JDWP back end uses modified UTF-8 28 * 29 * @author jjh 30 * 31 * @run build TestScaffold VMConnection TargetListener TargetAdapter 32 * @run compile -g UTF8Test.java 33 * @run driver UTF8Test 34 */ 35 36 /* 37 There is UTF-8 and there is modified UTF-8, which I will call M-UTF-8. 38 The two differ in the representation of binary 0, and 39 in some other more esoteric representations. 40 See 41 http://java.sun.com/developer/technicalArticles/Intl/Supplementary/#Modified_UTF-8 42 http://java.sun.com/javase/6/docs/technotes/guides/jni/spec/types.html#wp16542 43 44 All the following are observations of the treatment 45 of binary 0. 
In UTF-8, this is represented as one byte:
      0x00

  while in modified UTF-8, it is represented as two bytes
      0xc0 0x80

  ** I haven't investigated if the other differences between UTF-8 and
     M-UTF-8 are handled in the same way.

  Here is how these are handled in our BE, JDWP, and FE:

  - Strings in .class files are M-UTF-8.

  - To get the value of a string object from the VM, our BE calls
        char * utf = JNI_FUNC_PTR(env,GetStringUTFChars)(env, string, NULL);
    which returns M-UTF-8.

  - To create a string object in the VM, our BE VirtualMachine.createString() calls
        string = JNI_FUNC_PTR(env,NewStringUTF)(env, cstring);
    This function expects the string to be M-UTF-8
    BUG:  If the string came from JDWP, then it is actually UTF-8

  - I haven't investigated strings in JVMTI.

  - The JDWP spec says that strings are UTF-8.  The intro
    says this for all strings, and the createString command and
    the StringReference.value command say it explicitly.

  - Our FE java writes strings to JDWP as UTF-8.

  - BE function outStream_writeString uses strlen meaning
    it expects no 0 bytes, meaning that it expects M-UTF-8
    This function writes the byte length and then calls
    outStream.c::writeBytes which just writes the bytes to JDWP as is.

    BUG: If such a string came from the VM via JNI, it is actually
         M-UTF-8
    FIX: - scan string to see if contains an M-UTF-8 char.
           if yes,
             - call String(bytes, 0, len, "UTF8")
               to get a java string.
Will this work - i.e., the
               input is M-UTF-8 instead of real UTF-8?
             - call some java method (NOT JNI which
               would just come back with M-UTF-8)
               on the String to get real UTF-8


  - The JDWP StringReference.value command reads a string
    from the BE out of the JDWP stream and does this to
    create a Java String for it (see PacketStream.readString):
        String readString() {
            String ret;
            int len = readInt();

            try {
                ret = new String(pkt.data, inCursor, len, "UTF8");
            } catch(java.io.UnsupportedEncodingException e) {

    This String ctor converts _both_ the M-UTF-8 0xc0 0x80
    and UTF-8 0x00 into a Java char containing 0x0000

    Does it do this for the other differences too?

  Summary:
  1.  JDWP says strings are UTF-8.
      We interpret this to mean standard UTF-8.

  2.  JVMTI will be changed to match JNI saying that strings
      are M-UTF-8.

  3.  The BE gets UTF-8 strings off JDWP and must convert them to
      M-UTF-8 before giving them to JVMTI or JNI.

  4.  The BE gets M-UTF-8 strings from JNI and JVMTI and
      must convert them to UTF-8 when writing to JDWP.


  Here is how the supplementals are represented in java Strings.
  This is from the java.lang.Character doc:
    The Java 2 platform uses the UTF-16 representation in char arrays and
    in the String and StringBuffer classes. In this representation,
    supplementary characters are represented as a pair of char values,
    the first from the high-surrogates range, (\uD800-\uDBFF), the second
    from the low-surrogates range (\uDC00-\uDFFF).
129 See utf8.txt 130 131 132 ---- 133 134 NSK Packet.java in the nsk/share/jdwp framework does this to write 135 a string to JDWP: 136 public void addString(String value) { 137 final int count = JDWP.TypeSize.INT + value.length(); 138 addInt(value.length()); 139 try { 140 addBytes(value.getBytes("UTF-8"), 0, value.length()); 141 } catch (UnsupportedEncodingException e) { 142 throw new Failure("Unsupported UTF-8 ecnoding while adding string value to JDWP packet:\n\t" 143 + e); 144 } 145 } 146 ?? Does this get the standard UTF-8? I would expect so. 147 148 and the readString method does this: 149 for (int i = 0; i < len; i++) 150 s[i] = getByte(); 151 152 try { 153 return new String(s, "UTF-8"); 154 } catch (UnsupportedEncodingException e) { 155 throw new Failure("Unsupported UTF-8 ecnoding while extracting string value from JDWP packet:\n\t" 156 + e); 157 } 158 Thus, this won't notice the modified UTF-8 coming in from JDWP . 159 160 161 */ 162 163 import com.sun.jdi.*; 164 import com.sun.jdi.event.*; 165 import com.sun.jdi.request.*; 166 import java.io.UnsupportedEncodingException; 167 import java.util.*; 168 169 /********** target program **********/ 170 171 /* 172 * The debuggee has a few Strings the debugger reads via JDI 173 */ 174 class UTF8Targ { 175 static String[] vals = new String[] {"xx\u0000yy", // standard UTF-8 0 176 "xx\ud800\udc00yy", // first supplementary 177 "xx\udbff\udfffyy" // last supplementary 178 // d800 = 1101 1000 0000 0000 dc00 = 1101 1100 0000 0000 179 // dbff = 1101 1011 1111 1111 dfff = 1101 1111 1111 1111 180 }; 181 182 static String aField; 183 184 public static void main(String[] args){ 185 System.out.println("Howdy!"); 186 gus(); 187 System.out.println("Goodbye from UTF8Targ!"); 188 } 189 static void gus() { 190 } 191 } 192 193 /********** test program **********/ 194 195 public class UTF8Test extends TestScaffold { 196 ClassType targetClass; 197 ThreadReference mainThread; 198 Field targetField; 199 UTF8Test (String args[]) { 200 
super(args); 201 } 202 203 public static void main(String[] args) throws Exception { 204 new UTF8Test(args).startTests(); 205 } 206 207 /********** test core **********/ 208 209 protected void runTests() throws Exception { 210 /* 211 * Get to the top of main() 212 * to determine targetClass and mainThread 213 */ 214 BreakpointEvent bpe = startToMain("UTF8Targ"); 215 targetClass = (ClassType)bpe.location().declaringType(); 216 targetField = targetClass.fieldByName("aField"); 217 218 ArrayReference targetVals = (ArrayReference)targetClass.getValue(targetClass.fieldByName("vals")); 219 220 /* For each string in the debuggee's 'val' array, verify that we can 221 * read that value via JDI. 222 */ 223 224 for (int ii = 0; ii < UTF8Targ.vals.length; ii++) { 225 StringReference val = (StringReference)targetVals.getValue(ii); 226 String valStr = val.value(); 227 228 /* 229 * Verify that we can read a value correctly. 230 * We read it via JDI, and access it directly from the static 231 * var in the debuggee class. 232 */ 233 if (!valStr.equals(UTF8Targ.vals[ii]) || 234 valStr.length() != UTF8Targ.vals[ii].length()) { 235 failure(" FAILED: Expected /" + printIt(UTF8Targ.vals[ii]) + 236 "/, but got /" + printIt(valStr) + "/, length = " + valStr.length()); 237 } 238 } 239 240 /* Test 'all' unicode chars - send them to the debuggee via JDI 241 * and then read them back. 
242 */ 243 doFancyVersion(); 244 245 resumeTo("UTF8Targ", "gus", "()V"); 246 try { 247 Thread.sleep(1000); 248 } catch (InterruptedException ee) { 249 } 250 251 252 /* 253 * resume the target listening for events 254 */ 255 256 listenUntilVMDisconnect(); 257 258 /* 259 * deal with results of test 260 * if anything has called failure("foo") testFailed will be true 261 */ 262 if (!testFailed) { 263 println("UTF8Test: passed"); 264 } else { 265 throw new Exception("UTF8Test: failed"); 266 } 267 } 268 269 /** 270 * For each unicode value, send a string containing 271 * it to the debuggee via JDI, read it back via JDI, and see if 272 * we get the same value. 273 */ 274 void doFancyVersion() throws Exception { 275 // This does 4 chars at a time just to save time. 276 for (int ii = Character.MIN_CODE_POINT; 277 ii < Character.MIN_SUPPLEMENTARY_CODE_POINT; 278 ii += 4) { 279 // Skip the surrogates 280 if (ii == Character.MIN_SURROGATE) { 281 ii = Character.MAX_SURROGATE - 3; 282 break; 283 } 284 doFancyTest(ii, ii + 1, ii + 2, ii + 3); 285 } 286 287 // Do the supplemental chars. 288 for (int ii = Character.MIN_SUPPLEMENTARY_CODE_POINT; 289 ii <= Character.MAX_CODE_POINT; 290 ii += 2000) { 291 // Too many of these so just do a few 292 doFancyTest(ii, ii + 1, ii + 2, ii + 3); 293 } 294 295 } 296 297 void doFancyTest(int ... args) throws Exception { 298 String ss = new String(args, 0, 4); 299 targetClass.setValue(targetField, vm().mirrorOf(ss)); 300 301 StringReference returnedVal = (StringReference)targetClass.getValue(targetField); 302 String returnedStr = returnedVal.value(); 303 304 if (!ss.equals(returnedStr)) { 305 failure("Set: FAILED: Expected /" + printIt(ss) + 306 "/, but got /" + printIt(returnedStr) + "/, length = " + returnedStr.length()); 307 } 308 } 309 310 /** 311 * Return a String containing binary representations of 312 * the chars in a String. 
313 */ 314 String printIt(String arg) { 315 char[] carray = arg.toCharArray(); 316 StringBuffer bb = new StringBuffer(arg.length() * 5); 317 for (int ii = 0; ii < arg.length(); ii++) { 318 int ccc = arg.charAt(ii); 319 bb.append(String.format("%1$04x ", ccc)); 320 } 321 return bb.toString(); 322 } 323 324 String printIt1(String arg) { 325 byte[] barray = null; 326 try { 327 barray = arg.getBytes("UTF-8"); 328 } catch (UnsupportedEncodingException ee) { 329 } 330 StringBuffer bb = new StringBuffer(barray.length * 3); 331 for (int ii = 0; ii < barray.length; ii++) { 332 bb.append(String.format("%1$02x ", barray[ii])); 333 } 334 return bb.toString(); 335 } 336 337 }