1 /*
   2  * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.codemodel.internal.util;
  27 
  28 import java.nio.CharBuffer;
  29 import java.nio.charset.CoderResult;
  30 
  31 
  32 /**
  33  * Utility class for dealing with surrogates.
  34  *
  35  * @author Mark Reinhold
  36  * @version 1.11, 03/01/23
  37  */
  38 
  39 class Surrogate {
  40 
  41     private Surrogate() { }
  42 
  43     // UTF-16 surrogate-character ranges
  44     //
  45     public static final char MIN_HIGH = '\uD800';
  46     public static final char MAX_HIGH = '\uDBFF';
  47     public static final char MIN_LOW  = '\uDC00';
  48     public static final char MAX_LOW  = '\uDFFF';
  49     public static final char MIN = MIN_HIGH;
  50     public static final char MAX = MAX_LOW;
  51 
  52     // Range of UCS-4 values that need surrogates in UTF-16
  53     //
  54     public static final int UCS4_MIN = 0x10000;
  55     public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;
  56 
  57     /**
  58      * Tells whether or not the given UTF-16 value is a high surrogate.
  59      */
  60     public static boolean isHigh(int c) {
  61         return (MIN_HIGH <= c) && (c <= MAX_HIGH);
  62     }
  63 
  64     /**
  65      * Tells whether or not the given UTF-16 value is a low surrogate.
  66      */
  67     public static boolean isLow(int c) {
  68         return (MIN_LOW <= c) && (c <= MAX_LOW);
  69     }
  70 
  71     /**
  72      * Tells whether or not the given UTF-16 value is a surrogate character,
  73      */
  74     public static boolean is(int c) {
  75         return (MIN <= c) && (c <= MAX);
  76     }
  77 
  78     /**
  79      * Tells whether or not the given UCS-4 character must be represented as a
  80      * surrogate pair in UTF-16.
  81      */
  82     public static boolean neededFor(int uc) {
  83         return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
  84     }
  85 
  86     /**
  87      * Returns the high UTF-16 surrogate for the given UCS-4 character.
  88      */
  89     public static char high(int uc) {
  90         return (char)(0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
  91     }
  92 
  93     /**
  94      * Returns the low UTF-16 surrogate for the given UCS-4 character.
  95      */
  96     public static char low(int uc) {
  97         return (char)(0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
  98     }
  99 
 100     /**
 101      * Converts the given surrogate pair into a 32-bit UCS-4 character.
 102      */
 103     public static int toUCS4(char c, char d) {
 104         return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
 105     }
 106 
 107     /**
 108      * Surrogate parsing support.  Charset implementations may use instances of
 109      * this class to handle the details of parsing UTF-16 surrogate pairs.
 110      */
 111     public static class Parser {
 112 
 113         public Parser() { }
 114 
 115         private int character;          // UCS-4
 116         private CoderResult error = CoderResult.UNDERFLOW;
 117         private boolean isPair;
 118 
 119         /**
 120          * Returns the UCS-4 character previously parsed.
 121          */
 122         public int character() {
 123             return character;
 124         }
 125 
 126         /**
 127          * Tells whether or not the previously-parsed UCS-4 character was
 128          * originally represented by a surrogate pair.
 129          */
 130         public boolean isPair() {
 131             return isPair;
 132         }
 133 
 134         /**
 135          * Returns the number of UTF-16 characters consumed by the previous
 136          * parse.
 137          */
 138         public int increment() {
 139             return isPair ? 2 : 1;
 140         }
 141 
 142         /**
 143          * If the previous parse operation detected an error, return the object
 144          * describing that error.
 145          */
 146         public CoderResult error() {
 147             return error;
 148         }
 149 
 150         /**
 151          * Returns an unmappable-input result object, with the appropriate
 152          * input length, for the previously-parsed character.
 153          */
 154         public CoderResult unmappableResult() {
 155             return CoderResult.unmappableForLength(isPair ? 2 : 1);
 156         }
 157 
 158         /**
 159          * Parses a UCS-4 character from the given source buffer, handling
 160          * surrogates.
 161          *
 162          * @param  c    The first character
 163          * @param  in   The source buffer, from which one more character
 164          *              will be consumed if c is a high surrogate
 165          *
 166          * @return   Either a parsed UCS-4 character, in which case the isPair()
 167          *           and increment() methods will return meaningful values, or
 168          *           -1, in which case error() will return a descriptive result
 169          *           object
 170          */
 171         public int parse(char c, CharBuffer in) {
 172             if (isHigh(c)) {
 173                 if (!in.hasRemaining()) {
 174                     error = CoderResult.UNDERFLOW;
 175                     return -1;
 176                 }
 177                 char d = in.get();
 178                 if (isLow(d)) {
 179                     character = toUCS4(c, d);
 180                     isPair = true;
 181                     error = null;
 182                     return character;
 183                 }
 184                 error = CoderResult.malformedForLength(1);
 185                 return -1;
 186             }
 187             if (isLow(c)) {
 188                 error = CoderResult.malformedForLength(1);
 189                 return -1;
 190             }
 191             character = c;
 192             isPair = false;
 193             error = null;
 194             return character;
 195         }
 196 
 197         /**
 198          * Parses a UCS-4 character from the given source buffer, handling
 199          * surrogates.
 200          *
 201          * @param  c    The first character
 202          * @param  ia   The input array, from which one more character
 203          *              will be consumed if c is a high surrogate
 204          * @param  ip   The input index
 205          * @param  il   The input limit
 206          *
 207          * @return   Either a parsed UCS-4 character, in which case the isPair()
 208          *           and increment() methods will return meaningful values, or
 209          *           -1, in which case error() will return a descriptive result
 210          *           object
 211          */
 212         public int parse(char c, char[] ia, int ip, int il) {
 213             if (isHigh(c)) {
 214                 if (il - ip < 2) {
 215                     error = CoderResult.UNDERFLOW;
 216                     return -1;
 217                 }
 218                 char d = ia[ip + 1];
 219                 if (isLow(d)) {
 220                     character = toUCS4(c, d);
 221                     isPair = true;
 222                     error = null;
 223                     return character;
 224                 }
 225                 error = CoderResult.malformedForLength(1);
 226                 return -1;
 227             }
 228             if (isLow(c)) {
 229                 error = CoderResult.malformedForLength(1);
 230                 return -1;
 231             }
 232             character = c;
 233             isPair = false;
 234             error = null;
 235             return character;
 236         }
 237 
 238     }
 239 
 240     /**
 241      * Surrogate generation support.  Charset implementations may use instances
 242      * of this class to handle the details of generating UTF-16 surrogate
 243      * pairs.
 244      */
 245     public static class Generator {
 246 
 247         public Generator() { }
 248 
 249         private CoderResult error = CoderResult.OVERFLOW;
 250 
 251         /**
 252          * If the previous generation operation detected an error, return the
 253          * object describing that error.
 254          */
 255         public CoderResult error() {
 256             return error;
 257         }
 258 
 259         /**
 260          * Generates one or two UTF-16 characters to represent the given UCS-4
 261          * character.
 262          *
 263          * @param  uc   The UCS-4 character
 264          * @param  len  The number of input bytes from which the UCS-4 value
 265          *              was constructed (used when creating result objects)
 266          * @param  dst  The destination buffer, to which one or two UTF-16
 267          *              characters will be written
 268          *
 269          * @return   Either a positive count of the number of UTF-16 characters
 270          *           written to the destination buffer, or -1, in which case
 271          *           error() will return a descriptive result object
 272          */
 273         public int generate(int uc, int len, CharBuffer dst) {
 274             if (uc <= 0xffff) {
 275                 if (is(uc)) {
 276                     error = CoderResult.malformedForLength(len);
 277                     return -1;
 278                 }
 279                 if (dst.remaining() < 1) {
 280                     error = CoderResult.OVERFLOW;
 281                     return -1;
 282                 }
 283                 dst.put((char)uc);
 284                 error = null;
 285                 return 1;
 286             }
 287             if (uc < UCS4_MIN) {
 288                 error = CoderResult.malformedForLength(len);
 289                 return -1;
 290             }
 291             if (uc <= UCS4_MAX) {
 292                 if (dst.remaining() < 2) {
 293                     error = CoderResult.OVERFLOW;
 294                     return -1;
 295                 }
 296                 dst.put(high(uc));
 297                 dst.put(low(uc));
 298                 error = null;
 299                 return 2;
 300             }
 301             error = CoderResult.unmappableForLength(len);
 302             return -1;
 303         }
 304 
 305         /**
 306          * Generates one or two UTF-16 characters to represent the given UCS-4
 307          * character.
 308          *
 309          * @param  uc   The UCS-4 character
 310          * @param  len  The number of input bytes from which the UCS-4 value
 311          *              was constructed (used when creating result objects)
 312          * @param  da   The destination array, to which one or two UTF-16
 313          *              characters will be written
 314          * @param  dp   The destination position
 315          * @param  dl   The destination limit
 316          *
 317          * @return   Either a positive count of the number of UTF-16 characters
 318          *           written to the destination buffer, or -1, in which case
 319          *           error() will return a descriptive result object
 320          */
 321         public int generate(int uc, int len, char[] da, int dp, int dl) {
 322             if (uc <= 0xffff) {
 323                 if (is(uc)) {
 324                     error = CoderResult.malformedForLength(len);
 325                     return -1;
 326                 }
 327                 if (dl - dp < 1) {
 328                     error = CoderResult.OVERFLOW;
 329                     return -1;
 330                 }
 331                 da[dp] = (char)uc;
 332                 error = null;
 333                 return 1;
 334             }
 335             if (uc < UCS4_MIN) {
 336                 error = CoderResult.malformedForLength(len);
 337                 return -1;
 338             }
 339             if (uc <= UCS4_MAX) {
 340                 if (dl - dp < 2) {
 341                     error = CoderResult.OVERFLOW;
 342                     return -1;
 343                 }
 344                 da[dp] = high(uc);
 345                 da[dp + 1] = low(uc);
 346                 error = null;
 347                 return 2;
 348             }
 349             error = CoderResult.unmappableForLength(len);
 350             return -1;
 351         }
 352 
 353     }
 354 
 355 }