1 /*
   2  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package sun.nio.cs;
  27 
  28 import java.nio.CharBuffer;
  29 import java.nio.charset.CoderResult;
  30 import java.nio.charset.MalformedInputException;
  31 import java.nio.charset.UnmappableCharacterException;
  32 
  33 /**
  34  * Utility class for dealing with surrogates.
  35  *
  36  * @author Mark Reinhold
  37  * @author Martin Buchholz
  38  * @author Ulf Zibis
  39  */
  40 public class Surrogate {
  41 
  42     private Surrogate() { }
  43 
  44     // TODO: Deprecate/remove the following redundant definitions
  45     public static final char MIN_HIGH = Character.MIN_HIGH_SURROGATE;
  46     public static final char MAX_HIGH = Character.MAX_HIGH_SURROGATE;
  47     public static final char MIN_LOW  = Character.MIN_LOW_SURROGATE;
  48     public static final char MAX_LOW  = Character.MAX_LOW_SURROGATE;
  49     public static final char MIN      = Character.MIN_SURROGATE;
  50     public static final char MAX      = Character.MAX_SURROGATE;
  51     public static final int UCS4_MIN  = Character.MIN_SUPPLEMENTARY_CODE_POINT;
  52     public static final int UCS4_MAX  = Character.MAX_CODE_POINT;
  53 
  54     /**
  55      * Tells whether or not the given value is in the high surrogate range.
  56      * Use of {@link Character#isHighSurrogate} is generally preferred.
  57      */
  58     public static boolean isHigh(int c) {
  59         return (MIN_HIGH <= c) && (c <= MAX_HIGH);
  60     }
  61 
  62     /**
  63      * Tells whether or not the given value is in the low surrogate range.
  64      * Use of {@link Character#isLowSurrogate} is generally preferred.
  65      */
  66     public static boolean isLow(int c) {
  67         return (MIN_LOW <= c) && (c <= MAX_LOW);
  68     }
  69 
  70     /**
  71      * Tells whether or not the given value is in the surrogate range.
  72      * Use of {@link Character#isSurrogate} is generally preferred.
  73      */
  74     public static boolean is(int c) {
  75         return (MIN <= c) && (c <= MAX);
  76     }
  77 
  78     /**
  79      * Tells whether or not the given UCS-4 character must be represented as a
  80      * surrogate pair in UTF-16.
  81      * Use of {@link Character#isSupplementaryCodePoint} is generally preferred.
  82      */
  83     public static boolean neededFor(int uc) {
  84         return Character.isSupplementaryCodePoint(uc);
  85     }
  86 
  87     /**
  88      * Returns the high UTF-16 surrogate for the given supplementary UCS-4 character.
  89      * Use of {@link Character#highSurrogate} is generally preferred.
  90      */
  91     public static char high(int uc) {
  92         assert Character.isSupplementaryCodePoint(uc);
  93         return Character.highSurrogate(uc);
  94     }
  95 
  96     /**
  97      * Returns the low UTF-16 surrogate for the given supplementary UCS-4 character.
  98      * Use of {@link Character#lowSurrogate} is generally preferred.
  99      */
 100     public static char low(int uc) {
 101         assert Character.isSupplementaryCodePoint(uc);
 102         return Character.lowSurrogate(uc);
 103     }
 104 
 105     /**
 106      * Converts the given surrogate pair into a 32-bit UCS-4 character.
 107      * Use of {@link Character#toCodePoint} is generally preferred.
 108      */
 109     public static int toUCS4(char c, char d) {
 110         assert Character.isHighSurrogate(c) && Character.isLowSurrogate(d);
 111         return Character.toCodePoint(c, d);
 112     }
 113 
 114     /**
 115      * Surrogate parsing support.  Charset implementations may use instances of
 116      * this class to handle the details of parsing UTF-16 surrogate pairs.
 117      */
 118     public static class Parser {
 119 
 120         public Parser() { }
 121 
 122         private int character;          // UCS-4
 123         private CoderResult error = CoderResult.UNDERFLOW;
 124         private boolean isPair;
 125 
 126         /**
 127          * Returns the UCS-4 character previously parsed.
 128          */
 129         public int character() {
 130             assert (error == null);
 131             return character;
 132         }
 133 
 134         /**
 135          * Tells whether or not the previously-parsed UCS-4 character was
 136          * originally represented by a surrogate pair.
 137          */
 138         public boolean isPair() {
 139             assert (error == null);
 140             return isPair;
 141         }
 142 
 143         /**
 144          * Returns the number of UTF-16 characters consumed by the previous
 145          * parse.
 146          */
 147         public int increment() {
 148             assert (error == null);
 149             return isPair ? 2 : 1;
 150         }
 151 
 152         /**
 153          * If the previous parse operation detected an error, return the object
 154          * describing that error.
 155          */
 156         public CoderResult error() {
 157             assert (error != null);
 158             return error;
 159         }
 160 
 161         /**
 162          * Returns an unmappable-input result object, with the appropriate
 163          * input length, for the previously-parsed character.
 164          */
 165         public CoderResult unmappableResult() {
 166             assert (error == null);
 167             return CoderResult.unmappableForLength(isPair ? 2 : 1);
 168         }
 169 
 170         /**
 171          * Parses a UCS-4 character from the given source buffer, handling
 172          * surrogates.
 173          *
 174          * @param  c    The first character
 175          * @param  in   The source buffer, from which one more character
 176          *              will be consumed if c is a high surrogate
 177          *
 178          * @return  Either a parsed UCS-4 character, in which case the isPair()
 179          *          and increment() methods will return meaningful values, or
 180          *          -1, in which case error() will return a descriptive result
 181          *          object
 182          */
 183         public int parse(char c, CharBuffer in) {
 184             if (Character.isHighSurrogate(c)) {
 185                 if (!in.hasRemaining()) {
 186                     error = CoderResult.UNDERFLOW;
 187                     return -1;
 188                 }
 189                 char d = in.get();
 190                 if (Character.isLowSurrogate(d)) {
 191                     character = Character.toCodePoint(c, d);
 192                     isPair = true;
 193                     error = null;
 194                     return character;
 195                 }
 196                 error = CoderResult.malformedForLength(1);
 197                 return -1;
 198             }
 199             if (Character.isLowSurrogate(c)) {
 200                 error = CoderResult.malformedForLength(1);
 201                 return -1;
 202             }
 203             character = c;
 204             isPair = false;
 205             error = null;
 206             return character;
 207         }
 208 
 209         /**
 210          * Parses a UCS-4 character from the given source buffer, handling
 211          * surrogates.
 212          *
 213          * @param  c    The first character
 214          * @param  ia   The input array, from which one more character
 215          *              will be consumed if c is a high surrogate
 216          * @param  ip   The input index
 217          * @param  il   The input limit
 218          *
 219          * @return  Either a parsed UCS-4 character, in which case the isPair()
 220          *          and increment() methods will return meaningful values, or
 221          *          -1, in which case error() will return a descriptive result
 222          *          object
 223          */
 224         public int parse(char c, char[] ia, int ip, int il) {
 225             assert (ia[ip] == c);
 226             if (Character.isHighSurrogate(c)) {
 227                 if (il - ip < 2) {
 228                     error = CoderResult.UNDERFLOW;
 229                     return -1;
 230                 }
 231                 char d = ia[ip + 1];
 232                 if (Character.isLowSurrogate(d)) {
 233                     character = Character.toCodePoint(c, d);
 234                     isPair = true;
 235                     error = null;
 236                     return character;
 237                 }
 238                 error = CoderResult.malformedForLength(1);
 239                 return -1;
 240             }
 241             if (Character.isLowSurrogate(c)) {
 242                 error = CoderResult.malformedForLength(1);
 243                 return -1;
 244             }
 245             character = c;
 246             isPair = false;
 247             error = null;
 248             return character;
 249         }
 250 
 251     }
 252 
 253     /**
 254      * Surrogate generation support.  Charset implementations may use instances
 255      * of this class to handle the details of generating UTF-16 surrogate
 256      * pairs.
 257      */
 258     public static class Generator {
 259 
 260         public Generator() { }
 261 
 262         private CoderResult error = CoderResult.OVERFLOW;
 263 
 264         /**
 265          * If the previous generation operation detected an error, return the
 266          * object describing that error.
 267          */
 268         public CoderResult error() {
 269             assert error != null;
 270             return error;
 271         }
 272 
 273         /**
 274          * Generates one or two UTF-16 characters to represent the given UCS-4
 275          * character.
 276          *
 277          * @param  uc   The UCS-4 character
 278          * @param  len  The number of input bytes from which the UCS-4 value
 279          *              was constructed (used when creating result objects)
 280          * @param  dst  The destination buffer, to which one or two UTF-16
 281          *              characters will be written
 282          *
 283          * @return  Either a positive count of the number of UTF-16 characters
 284          *          written to the destination buffer, or -1, in which case
 285          *          error() will return a descriptive result object
 286          */
 287         public int generate(int uc, int len, CharBuffer dst) {
 288             if (Character.isBmpCodePoint(uc)) {
 289                 char c = (char) uc;
 290                 if (Character.isSurrogate(c)) {
 291                     error = CoderResult.malformedForLength(len);
 292                     return -1;
 293                 }
 294                 if (dst.remaining() < 1) {
 295                     error = CoderResult.OVERFLOW;
 296                     return -1;
 297                 }
 298                 dst.put(c);
 299                 error = null;
 300                 return 1;
 301             } else if (Character.isValidCodePoint(uc)) {
 302                 if (dst.remaining() < 2) {
 303                     error = CoderResult.OVERFLOW;
 304                     return -1;
 305                 }
 306                 dst.put(Character.highSurrogate(uc));
 307                 dst.put(Character.lowSurrogate(uc));
 308                 error = null;
 309                 return 2;
 310             } else {
 311                 error = CoderResult.unmappableForLength(len);
 312                 return -1;
 313             }
 314         }
 315 
 316         /**
 317          * Generates one or two UTF-16 characters to represent the given UCS-4
 318          * character.
 319          *
 320          * @param  uc   The UCS-4 character
 321          * @param  len  The number of input bytes from which the UCS-4 value
 322          *              was constructed (used when creating result objects)
 323          * @param  da   The destination array, to which one or two UTF-16
 324          *              characters will be written
 325          * @param  dp   The destination position
 326          * @param  dl   The destination limit
 327          *
 328          * @return  Either a positive count of the number of UTF-16 characters
 329          *          written to the destination buffer, or -1, in which case
 330          *          error() will return a descriptive result object
 331          */
 332         public int generate(int uc, int len, char[] da, int dp, int dl) {
 333             if (Character.isBmpCodePoint(uc)) {
 334                 char c = (char) uc;
 335                 if (Character.isSurrogate(c)) {
 336                     error = CoderResult.malformedForLength(len);
 337                     return -1;
 338                 }
 339                 if (dl - dp < 1) {
 340                     error = CoderResult.OVERFLOW;
 341                     return -1;
 342                 }
 343                 da[dp] = c;
 344                 error = null;
 345                 return 1;
 346             } else if (Character.isValidCodePoint(uc)) {
 347                 if (dl - dp < 2) {
 348                     error = CoderResult.OVERFLOW;
 349                     return -1;
 350                 }
 351                 da[dp] = Character.highSurrogate(uc);
 352                 da[dp + 1] = Character.lowSurrogate(uc);
 353                 error = null;
 354                 return 2;
 355             } else {
 356                 error = CoderResult.unmappableForLength(len);
 357                 return -1;
 358             }
 359         }
 360     }
 361 
 362 }