1 /* 2 * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.codemodel.internal.util; 27 28 import java.nio.CharBuffer; 29 import java.nio.charset.CoderResult; 30 31 32 /** 33 * Utility class for dealing with surrogates. 
/**
 * Static helpers, together with the nested {@link Parser} and
 * {@link Generator} classes, for working with UTF-16 surrogate pairs and the
 * UCS-4 characters they encode.
 *
 * @author Mark Reinhold
 * @version 1.11, 03/01/23
 */
final class Surrogate {

    private Surrogate() { }

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW = '\uDC00';
    public static final char MAX_LOW = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    // Range of UCS-4 values that need surrogates in UTF-16
    // (0x10000 through 0x10FFFF inclusive)
    //
    public static final int UCS4_MIN = 0x10000;
    public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

    /**
     * Tells whether or not the given UTF-16 value is a high surrogate.
     *
     * @param c  The UTF-16 value to test
     * @return  <tt>true</tt> if c lies in the range MIN_HIGH..MAX_HIGH
     */
    public static boolean isHigh(int c) {
        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
    }

    /**
     * Tells whether or not the given UTF-16 value is a low surrogate.
     *
     * @param c  The UTF-16 value to test
     * @return  <tt>true</tt> if c lies in the range MIN_LOW..MAX_LOW
     */
    public static boolean isLow(int c) {
        return (MIN_LOW <= c) && (c <= MAX_LOW);
    }

    /**
     * Tells whether or not the given UTF-16 value is a surrogate character.
     *
     * @param c  The UTF-16 value to test
     * @return  <tt>true</tt> if c lies in the range MIN..MAX
     */
    public static boolean is(int c) {
        return (MIN <= c) && (c <= MAX);
    }

    /**
     * Tells whether or not the given UCS-4 character must be represented as a
     * surrogate pair in UTF-16.
     *
     * @param uc  The UCS-4 character
     * @return  <tt>true</tt> if uc lies in the range UCS4_MIN..UCS4_MAX
     */
    public static boolean neededFor(int uc) {
        return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
    }

    /**
     * Returns the high UTF-16 surrogate for the given UCS-4 character.
     * The result is only meaningful when {@link #neededFor} is true for uc.
     */
    public static char high(int uc) {
        // Upper ten bits of the offset from UCS4_MIN
        return (char)(0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
    }

    /**
     * Returns the low UTF-16 surrogate for the given UCS-4 character.
     * The result is only meaningful when {@link #neededFor} is true for uc.
     */
    public static char low(int uc) {
        // Lower ten bits of the offset from UCS4_MIN
        return (char)(0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
    }

    /**
     * Converts the given surrogate pair into a 32-bit UCS-4 character.
     * The arguments are not validated; only their low ten bits are used.
     *
     * @param c  The high surrogate
     * @param d  The low surrogate
     */
    public static int toUCS4(char c, char d) {
        return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
    }

    /**
     * Surrogate parsing support.  Charset implementations may use instances of
     * this class to handle the details of parsing UTF-16 surrogate pairs.
     */
    public static class Parser {

        public Parser() { }

        private int character;          // UCS-4
        private CoderResult error = CoderResult.UNDERFLOW;
        private boolean isPair;

        /**
         * Returns the UCS-4 character previously parsed.
         */
        public int character() {
            return character;
        }

        /**
         * Tells whether or not the previously-parsed UCS-4 character was
         * originally represented by a surrogate pair.
         */
        public boolean isPair() {
            return isPair;
        }

        /**
         * Returns the number of UTF-16 characters consumed by the previous
         * parse.
         */
        public int increment() {
            return isPair ? 2 : 1;
        }

        /**
         * If the previous parse operation detected an error, return the object
         * describing that error.  After a successful parse this is
         * <tt>null</tt>.
         */
        public CoderResult error() {
            return error;
        }

        /**
         * Returns an unmappable-input result object, with the appropriate
         * input length, for the previously-parsed character.
         */
        public CoderResult unmappableResult() {
            return CoderResult.unmappableForLength(increment());
        }

        /**
         * Parses a UCS-4 character from the given source buffer, handling
         * surrogates.
         *
         * <p> When c is a high surrogate and the buffer is not empty, the
         * next character is consumed from the buffer even if it turns out not
         * to be a low surrogate. </p>
         *
         * @param  c   The first character
         * @param  in  The source buffer, from which one more character
         *             will be consumed if c is a high surrogate
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, CharBuffer in) {
            if (isHigh(c)) {
                if (!in.hasRemaining()) {
                    return reject(CoderResult.UNDERFLOW);
                }
                char d = in.get();
                if (isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                return reject(CoderResult.malformedForLength(1));
            }
            if (isLow(c)) {
                // A low surrogate with no preceding high surrogate
                return reject(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

        /**
         * Parses a UCS-4 character from the given source array, handling
         * surrogates.
         *
         * @param  c   The first character
         * @param  ia  The input array, from which one more character
         *             (the one at index ip + 1) will be read if c is a high
         *             surrogate
         * @param  ip  The input index
         * @param  il  The input limit
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, char[] ia, int ip, int il) {
            if (isHigh(c)) {
                if (il - ip < 2) {
                    return reject(CoderResult.UNDERFLOW);
                }
                char d = ia[ip + 1];
                if (isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                return reject(CoderResult.malformedForLength(1));
            }
            if (isLow(c)) {
                // A low surrogate with no preceding high surrogate
                return reject(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

        // Records a successful parse and returns the parsed character
        private int accept(int ucs4, boolean pair) {
            character = ucs4;
            isPair = pair;
            error = null;
            return ucs4;
        }

        // Records a failed parse and returns the -1 failure marker
        private int reject(CoderResult result) {
            error = result;
            return -1;
        }

    }

    /**
     * Surrogate generation support.  Charset implementations may use instances
     * of this class to handle the details of generating UTF-16 surrogate
     * pairs.
     */
    public static class Generator {

        public Generator() { }

        private CoderResult error = CoderResult.OVERFLOW;

        /**
         * If the previous generation operation detected an error, return the
         * object describing that error.  After a successful generation this is
         * <tt>null</tt>.
         */
        public CoderResult error() {
            return error;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Values in the surrogate range are reported as malformed; values
         * that are negative or greater than UCS4_MAX are reported as
         * unmappable.  Nothing is written when an error is reported. </p>
         *
         * @param  uc   The UCS-4 character
         * @param  len  The number of input bytes from which the UCS-4 value
         *              was constructed (used when creating result objects)
         * @param  dst  The destination buffer, to which one or two UTF-16
         *              characters will be written
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination buffer, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, CharBuffer dst) {
            int count = required(uc, len, dst.remaining());
            if (count < 0) {
                return -1;
            }
            if (count == 2) {
                dst.put(high(uc));
                dst.put(low(uc));
            } else {
                dst.put((char)uc);
            }
            error = null;
            return count;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Values in the surrogate range are reported as malformed; values
         * that are negative or greater than UCS4_MAX are reported as
         * unmappable.  Nothing is written when an error is reported. </p>
         *
         * @param  uc   The UCS-4 character
         * @param  len  The number of input bytes from which the UCS-4 value
         *              was constructed (used when creating result objects)
         * @param  da   The destination array, to which one or two UTF-16
         *              characters will be written
         * @param  dp   The destination position
         * @param  dl   The destination limit
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination buffer, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, char[] da, int dp, int dl) {
            int count = required(uc, len, dl - dp);
            if (count < 0) {
                return -1;
            }
            if (count == 2) {
                da[dp] = high(uc);
                da[dp + 1] = low(uc);
            } else {
                da[dp] = (char)uc;
            }
            error = null;
            return count;
        }

        // Validates uc and the available room; returns the number of UTF-16
        // characters needed (1 or 2), or -1 after recording the error
        private int required(int uc, int len, int room) {
            // Negative values used to fall into the single-char path and be
            // silently truncated by the (char) cast; reject them instead
            if (uc < 0 || uc > UCS4_MAX) {
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            if (is(uc)) {
                error = CoderResult.malformedForLength(len);
                return -1;
            }
            int needed = (uc < UCS4_MIN) ? 1 : 2;
            if (room < needed) {
                error = CoderResult.OVERFLOW;
                return -1;
            }
            return needed;
        }

    }

}