1 /*
2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 /*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37 package sun.text.normalizer;
38
39 /**
40 * <p>Standalone utility class providing UTF16 character conversions and
41 * indexing conversions.
42 * <p>Code that uses strings alone rarely needs modification.
43 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
44 * operation. Similarly, concatenation is always safe. Substringing is safe if
45 * the start and end are both on UTF-32 boundaries. In normal code, the values
46 * for start and end are on those boundaries, since they arose from operations
47 * like searching. If not, the nearest UTF-32 boundaries can be determined
48 * using <code>bounds()</code>.
49 * <strong>Examples:</strong>
50 * <p>The following examples illustrate use of some of these methods.
51 * <pre>{@code
52 * // iteration forwards: Original
53 * for (int i = 0; i < s.length(); ++i) {
54 * char ch = s.charAt(i);
55 * doSomethingWith(ch);
56 * }
57 *
58 * // iteration forwards: Changes for UTF-32
59 * int ch;
60 * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
61 * ch = UTF16.charAt(s,i);
62 * doSomethingWith(ch);
63 * }
64 *
65 * // iteration backwards: Original
66 * for (int i = s.length() -1; i >= 0; --i) {
67 * char ch = s.charAt(i);
68 * doSomethingWith(ch);
69 * }
70 *
71 * // iteration backwards: Changes for UTF-32
72 * int ch;
73 * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
74 * ch = UTF16.charAt(s,i);
75 * doSomethingWith(ch);
76 * }
77 * }</pre>
78 * <strong>Notes:</strong>
79 * <ul>
80 * <li>
81 * <strong>Naming:</strong> For clarity, High and Low surrogates are called
82 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better
83 * sense of their ordering in a string. <code>offset16</code> and
84 * <code>offset32</code> are used to distinguish offsets to UTF-16
85 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
86 * used to contain UTF-32 characters, as opposed to <code>char16</code>,
87 * which is a UTF-16 code unit.
88 * </li>
89 * <li>
90 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
91 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in
92 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
93 * back if and only if <code>bounds(string, offset16) != TRAIL</code>.
94 * </li>
144 /**
145 * Trail surrogate minimum value
146 * @stable ICU 2.1
147 */
148 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
149 /**
150 * Lead surrogate maximum value
151 * @stable ICU 2.1
152 */
153 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
154 /**
155 * Trail surrogate maximum value
156 * @stable ICU 2.1
157 */
158 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
159 /**
160 * Surrogate minimum value
161 * @stable ICU 2.1
162 */
163 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
164
165 // public method ------------------------------------------------------
166
167 /**
168 * Extract a single UTF-32 value from a string.
169 * Used when iterating forwards or backwards (with
170 * <code>UTF16.getCharCount()</code>, as well as random access. If a
171 * validity check is required, use
172 * <code><a href="../lang/UCharacter.html#isLegal(char)">
173 * UCharacter.isLegal()</a></code> on the return value.
174 * If the char retrieved is part of a surrogate pair, its supplementary
175 * character will be returned. If a complete supplementary character is
176 * not found the incomplete character will be returned
177 * @param source array of UTF-16 chars
178 * @param offset16 UTF-16 offset to the start of the character.
179 * @return UTF-32 value for the UTF-32 value that contains the char at
180 * offset16. The boundaries of that codepoint are the same as in
181 * <code>bounds32()</code>.
182 * @exception IndexOutOfBoundsException thrown if offset16 is out of
183 * bounds.
184 * @stable ICU 2.1
205 if (source.length() != offset16) {
206 char trail = source.charAt(offset16);
207 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
208 return UCharacterProperty.getRawSupplementary(single, trail);
209 }
210 }
211 } else {
212 --offset16;
213 if (offset16 >= 0) {
214 // single is a trail surrogate so
215 char lead = source.charAt(offset16);
216 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
217 return UCharacterProperty.getRawSupplementary(lead, single);
218 }
219 }
220 }
221 return single; // return unmatched surrogate
222 }
223
224 /**
225 * Extract a single UTF-32 value from a substring.
226 * Used when iterating forwards or backwards (with
227 * <code>UTF16.getCharCount()</code>, as well as random access. If a
228 * validity check is required, use
229 * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
230 * </a></code> on the return value.
231 * If the char retrieved is part of a surrogate pair, its supplementary
232 * character will be returned. If a complete supplementary character is
233 * not found the incomplete character will be returned
234 * @param source array of UTF-16 chars
235 * @param start offset to substring in the source array for analyzing
236 * @param limit offset to substring in the source array for analyzing
237 * @param offset16 UTF-16 offset relative to start
238 * @return UTF-32 value for the UTF-32 value that contains the char at
239 * offset16. The boundaries of that codepoint are the same as in
240 * <code>bounds32()</code>.
241 * @exception IndexOutOfBoundsException thrown if offset16 is not within
242 * the range of start and limit.
243 * @stable ICU 2.1
244 */
245 public static int charAt(char source[], int start, int limit,
246 int offset16)
247 {
248 offset16 += start;
249 if (offset16 < start || offset16 >= limit) {
250 throw new ArrayIndexOutOfBoundsException(offset16);
251 }
252
253 char single = source[offset16];
254 if (!isSurrogate(single)) {
255 return single;
256 }
257
258 // Convert the UTF-16 surrogate pair if necessary.
259 // For simplicity in usage, and because the frequency of pairs is
260 // low, look both directions.
261 if (single <= LEAD_SURROGATE_MAX_VALUE) {
262 offset16 ++;
263 if (offset16 >= limit) {
264 return single;
265 }
266 char trail = source[offset16];
267 if (isTrailSurrogate(trail)) {
268 return UCharacterProperty.getRawSupplementary(single, trail);
269 }
270 }
271 else { // isTrailSurrogate(single), so
272 if (offset16 == start) {
273 return single;
274 }
275 offset16 --;
276 char lead = source[offset16];
277 if (isLeadSurrogate(lead))
278 return UCharacterProperty.getRawSupplementary(lead, single);
279 }
280 return single; // return unmatched surrogate
281 }
282
283 /**
284 * Determines how many chars this char32 requires.
285 * If a validity check is required, use <code>
286 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
287 * char32 before calling.
288 * @param char32 the input codepoint.
289 * @return 2 if is in supplementary space, otherwise 1.
290 * @stable ICU 2.1
291 */
292 public static int getCharCount(int char32)
293 {
294 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
295 return 1;
296 }
297 return 2;
298 }
299
300 /**
301 * Determines whether the code value is a surrogate.
302 * @param char16 the input character.
303 * @return true iff the input character is a surrogate.
304 * @stable ICU 2.1
305 */
306 public static boolean isSurrogate(char char16)
307 {
308 return LEAD_SURROGATE_MIN_VALUE <= char16 &&
309 char16 <= TRAIL_SURROGATE_MAX_VALUE;
310 }
311
312 /**
313 * Determines whether the character is a trail surrogate.
314 * @param char16 the input character.
315 * @return true iff the input character is a trail surrogate.
316 * @stable ICU 2.1
317 */
318 public static boolean isTrailSurrogate(char char16)
319 {
320 return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
321 char16 <= TRAIL_SURROGATE_MAX_VALUE);
322 }
323
324 /**
325 * Determines whether the character is a lead surrogate.
326 * @param char16 the input character.
327 * @return true iff the input character is a lead surrogate
328 * @stable ICU 2.1
329 */
330 public static boolean isLeadSurrogate(char char16)
331 {
332 return LEAD_SURROGATE_MIN_VALUE <= char16 &&
333 char16 <= LEAD_SURROGATE_MAX_VALUE;
334 }
335
336 /**
337 * Returns the lead surrogate.
338 * If a validity check is required, use
339 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
340 * on char32 before calling.
341 * @param char32 the input character.
342 * @return lead surrogate if the getCharCount(ch) is 2; <br>
343 * and 0 otherwise (note: 0 is not a valid lead surrogate).
344 * @stable ICU 2.1
345 */
346 public static char getLeadSurrogate(int char32)
347 {
348 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
349 return (char)(LEAD_SURROGATE_OFFSET_ +
350 (char32 >> LEAD_SURROGATE_SHIFT_));
351 }
352
353 return 0;
354 }
355
356 /**
357 * Returns the trail surrogate.
358 * If a validity check is required, use
359 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
360 * on char32 before calling.
361 * @param char32 the input character.
362 * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
363 * the character itself
364 * @stable ICU 2.1
365 */
366 public static char getTrailSurrogate(int char32)
367 {
368 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
369 return (char)(TRAIL_SURROGATE_MIN_VALUE +
370 (char32 & TRAIL_SURROGATE_MASK_));
371 }
372
373 return (char)char32;
374 }
375
376 /**
377 * Convenience method corresponding to String.valueOf(char). Returns a one
378 * or two char string containing the UTF-32 value in UTF16 format. If a
379 * validity check is required, use
380 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
381 * on char32 before calling.
382 * @param char32 the input character.
383 * @return string value of char32 in UTF16 format
384 * @exception IllegalArgumentException thrown if char32 is an invalid
385 * codepoint.
386 * @stable ICU 2.1
387 */
388 public static String valueOf(int char32)
389 {
390 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
391 throw new IllegalArgumentException("Illegal codepoint");
392 }
393 return toString(char32);
402 * @param char32 value to append.
403 * @return the updated StringBuffer
404 * @exception IllegalArgumentException thrown when char32 does not lie
405 * within the range of the Unicode codepoints
406 * @stable ICU 2.1
407 */
408 public static StringBuffer append(StringBuffer target, int char32)
409 {
410 // Check for irregular values
411 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
412 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
413 }
414
415 // Write the UTF-16 values
416 if (char32 >= SUPPLEMENTARY_MIN_VALUE)
417 {
418 target.append(getLeadSurrogate(char32));
419 target.append(getTrailSurrogate(char32));
420 }
421 else {
422 target.append((char)char32);
423 }
424 return target;
425 }
426
427 //// for StringPrep
428 /**
429 * Shifts offset16 by the argument number of codepoints within a subarray.
430 * @param source char array
431 * @param start position of the subarray to be performed on
432 * @param limit position of the subarray to be performed on
433 * @param offset16 UTF16 position to shift relative to start
434 * @param shift32 number of codepoints to shift
435 * @return new shifted offset16 relative to start
436 * @exception IndexOutOfBoundsException if the new offset16 is out of
437 * bounds with respect to the subarray or the subarray bounds
438 * are out of range.
439 * @stable ICU 2.1
440 */
441 public static int moveCodePointOffset(char source[], int start, int limit,
442 int offset16, int shift32)
443 {
444 int size = source.length;
445 int count;
446 char ch;
447 int result = offset16 + start;
448 if (start<0 || limit<start) {
449 throw new StringIndexOutOfBoundsException(start);
450 }
451 if (limit>size) {
452 throw new StringIndexOutOfBoundsException(limit);
453 }
454 if (offset16<0 || result>limit) {
455 throw new StringIndexOutOfBoundsException(offset16);
456 }
457 if (shift32 > 0 ) {
458 if (shift32 + result > size) {
459 throw new StringIndexOutOfBoundsException(result);
460 }
461 count = shift32;
462 while (result < limit && count > 0)
463 {
464 ch = source[result];
465 if (isLeadSurrogate(ch) && (result+1 < limit) &&
466 isTrailSurrogate(source[result+1])) {
467 result ++;
468 }
469 count --;
470 result ++;
471 }
472 } else {
473 if (result + shift32 < start) {
474 throw new StringIndexOutOfBoundsException(result);
475 }
476 for (count=-shift32; count>0; count--) {
477 result--;
478 if (result<start) {
479 break;
480 }
481 ch = source[result];
482 if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
483 result--;
484 }
485 }
486 }
487 if (count != 0) {
488 throw new StringIndexOutOfBoundsException(shift32);
489 }
490 result -= start;
491 return result;
492 }
493
494 // private data members -------------------------------------------------
495
496 /**
497 * Shift value for lead surrogate to form a supplementary character.
498 */
499 private static final int LEAD_SURROGATE_SHIFT_ = 10;
500
501 /**
502 * Mask to retrieve the significant value from a trail surrogate.
510 LEAD_SURROGATE_MIN_VALUE -
511 (SUPPLEMENTARY_MIN_VALUE
512 >> LEAD_SURROGATE_SHIFT_);
513
514 // private methods ------------------------------------------------------
515
516 /**
517 * <p>Converts argument code point and returns a String object representing
518 * the code point's value in UTF16 format.
519 * <p>This method does not check for the validity of the codepoint, the
520 * results are not guaranteed if a invalid codepoint is passed as
521 * argument.
522 * <p>The result is a string whose length is 1 for non-supplementary code
523 * points, 2 otherwise.
524 * @param ch code point
525 * @return string representation of the code point
526 */
527 private static String toString(int ch)
528 {
529 if (ch < SUPPLEMENTARY_MIN_VALUE) {
530 return String.valueOf((char)ch);
531 }
532
533 StringBuilder result = new StringBuilder();
534 result.append(getLeadSurrogate(ch));
535 result.append(getTrailSurrogate(ch));
536 return result.toString();
537 }
538 }
|
1 /*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 /**
26 *******************************************************************************
27 * Copyright (C) 1996-2014, International Business Machines Corporation and
28 * others. All Rights Reserved.
29 *******************************************************************************
30 */
31
32 package sun.text.normalizer;
33
34 /**
35 * <p>Standalone utility class providing UTF16 character conversions and
36 * indexing conversions.
37 * <p>Code that uses strings alone rarely needs modification.
38 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
39 * operation. Similarly, concatenation is always safe. Substringing is safe if
40 * the start and end are both on UTF-32 boundaries. In normal code, the values
41 * for start and end are on those boundaries, since they arose from operations
42 * like searching. If not, the nearest UTF-32 boundaries can be determined
43 * using <code>bounds()</code>.
44 * <strong>Examples:</strong>
45 * <p>The following examples illustrate use of some of these methods.
46 * <pre>{@code
47 * // iteration forwards: Original
48 * for (int i = 0; i < s.length(); ++i) {
49 * char ch = s.charAt(i);
50 * doSomethingWith(ch);
51 * }
52 *
53 * // iteration forwards: Changes for UTF-32
54 * int ch;
55 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
56 * ch = UTF16.charAt(s, i);
57 * doSomethingWith(ch);
58 * }
59 *
60 * // iteration backwards: Original
61 * for (int i = s.length() - 1; i >= 0; --i) {
62 * char ch = s.charAt(i);
63 * doSomethingWith(ch);
64 * }
65 *
66 * // iteration backwards: Changes for UTF-32
67 * int ch;
68 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
69 * ch = UTF16.charAt(s, i);
70 * doSomethingWith(ch);
71 * }
72 * }</pre>
73 * <strong>Notes:</strong>
74 * <ul>
75 * <li>
76 * <strong>Naming:</strong> For clarity, High and Low surrogates are called
77 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better
78 * sense of their ordering in a string. <code>offset16</code> and
79 * <code>offset32</code> are used to distinguish offsets to UTF-16
80 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
81 * used to contain UTF-32 characters, as opposed to <code>char16</code>,
82 * which is a UTF-16 code unit.
83 * </li>
84 * <li>
85 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
86 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in
87 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
88 * back if and only if <code>bounds(string, offset16) != TRAIL</code>.
89 * </li>
139 /**
140 * Trail surrogate minimum value
141 * @stable ICU 2.1
142 */
143 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
144 /**
145 * Lead surrogate maximum value
146 * @stable ICU 2.1
147 */
148 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
149 /**
150 * Trail surrogate maximum value
151 * @stable ICU 2.1
152 */
153 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
154 /**
155 * Surrogate minimum value
156 * @stable ICU 2.1
157 */
158 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
159 /**
160 * Lead surrogate bitmask
161 */
162 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
163 /**
164 * Trail surrogate bitmask
165 */
166 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
167 /**
168 * Surrogate bitmask
169 */
170 private static final int SURROGATE_BITMASK = 0xFFFFF800;
171 /**
172 * Lead surrogate bits
173 */
174 private static final int LEAD_SURROGATE_BITS = 0xD800;
175 /**
176 * Trail surrogate bits
177 */
178 private static final int TRAIL_SURROGATE_BITS = 0xDC00;
179 /**
180 * Surrogate bits
181 */
182 private static final int SURROGATE_BITS = 0xD800;
183
184 // constructor --------------------------------------------------------
185
186 // /CLOVER:OFF
/**
 * Private and empty: this class is not meant to be instantiated.
 */
private UTF16() {}
192
193 // /CLOVER:ON
194 // public method ------------------------------------------------------
195
196 /**
197 * Extract a single UTF-32 value from a string.
198 * Used when iterating forwards or backwards (with
199 * <code>UTF16.getCharCount()</code>, as well as random access. If a
200 * validity check is required, use
201 * <code><a href="../lang/UCharacter.html#isLegal(char)">
202 * UCharacter.isLegal()</a></code> on the return value.
203 * If the char retrieved is part of a surrogate pair, its supplementary
204 * character will be returned. If a complete supplementary character is
205 * not found the incomplete character will be returned
206 * @param source array of UTF-16 chars
207 * @param offset16 UTF-16 offset to the start of the character.
208 * @return UTF-32 value for the UTF-32 value that contains the char at
209 * offset16. The boundaries of that codepoint are the same as in
210 * <code>bounds32()</code>.
211 * @exception IndexOutOfBoundsException thrown if offset16 is out of
212 * bounds.
213 * @stable ICU 2.1
234 if (source.length() != offset16) {
235 char trail = source.charAt(offset16);
236 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
237 return UCharacterProperty.getRawSupplementary(single, trail);
238 }
239 }
240 } else {
241 --offset16;
242 if (offset16 >= 0) {
243 // single is a trail surrogate so
244 char lead = source.charAt(offset16);
245 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
246 return UCharacterProperty.getRawSupplementary(lead, single);
247 }
248 }
249 }
250 return single; // return unmatched surrogate
251 }
252
253 /**
254 * Extract a single UTF-32 value from a string.
255 * Used when iterating forwards or backwards (with
256 * <code>UTF16.getCharCount()</code>, as well as random access. If a
257 * validity check is required, use
258 * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
259 * </a></code> on the return value.
260 * If the char retrieved is part of a surrogate pair, its supplementary
261 * character will be returned. If a complete supplementary character is
262 * not found the incomplete character will be returned
263 * @param source array of UTF-16 chars
264 * @param offset16 UTF-16 offset to the start of the character.
265 * @return UTF-32 value for the UTF-32 value that contains the char at
266 * offset16. The boundaries of that codepoint are the same as in
267 * <code>bounds32()</code>.
268 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
269 * @stable ICU 2.1
270 */
271 public static int charAt(CharSequence source, int offset16) {
272 char single = source.charAt(offset16);
273 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
274 return single;
275 }
276 return _charAt(source, offset16, single);
277 }
278
279 private static int _charAt(CharSequence source, int offset16, char single) {
280 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
281 return single;
282 }
283
284 // Convert the UTF-16 surrogate pair if necessary.
285 // For simplicity in usage, and because the frequency of pairs is
286 // low, look both directions.
287
288 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
289 ++offset16;
290 if (source.length() != offset16) {
291 char trail = source.charAt(offset16);
292 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
293 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
294 return UCharacterProperty.getRawSupplementary(single, trail);
295 }
296 }
297 } else {
298 --offset16;
299 if (offset16 >= 0) {
300 // single is a trail surrogate so
301 char lead = source.charAt(offset16);
302 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
303 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
304 return UCharacterProperty.getRawSupplementary(lead, single);
305 }
306 }
307 }
308 return single; // return unmatched surrogate
309 }
310
311 /**
312 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
313 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
314 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
315 * </a></code>
316 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
317 * character will be returned. If a complete supplementary character is not found the incomplete
318 * character will be returned
319 *
320 * @param source Array of UTF-16 chars
321 * @param start Offset to substring in the source array for analyzing
322 * @param limit Offset to substring in the source array for analyzing
323 * @param offset16 UTF-16 offset relative to start
324 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
325 * of that codepoint are the same as in <code>bounds32()</code>.
326 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
327 * @stable ICU 2.1
328 */
329 public static int charAt(char source[], int start, int limit, int offset16) {
330 offset16 += start;
331 if (offset16 < start || offset16 >= limit) {
332 throw new ArrayIndexOutOfBoundsException(offset16);
333 }
334
335 char single = source[offset16];
336 if (!isSurrogate(single)) {
337 return single;
338 }
339
340 // Convert the UTF-16 surrogate pair if necessary.
341 // For simplicity in usage, and because the frequency of pairs is
342 // low, look both directions.
343 if (single <= LEAD_SURROGATE_MAX_VALUE) {
344 offset16++;
345 if (offset16 >= limit) {
346 return single;
347 }
348 char trail = source[offset16];
349 if (isTrailSurrogate(trail)) {
350 return UCharacterProperty.getRawSupplementary(single, trail);
351 }
352 }
353 else { // isTrailSurrogate(single), so
354 if (offset16 == start) {
355 return single;
356 }
357 offset16--;
358 char lead = source[offset16];
359 if (isLeadSurrogate(lead))
360 return UCharacterProperty.getRawSupplementary(lead, single);
361 }
362 return single; // return unmatched surrogate
363 }
364
365 /**
366 * Determines how many chars this char32 requires.
367 * If a validity check is required, use <code>
368 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
369 * char32 before calling.
370 * @param char32 the input codepoint.
371 * @return 2 if is in supplementary space, otherwise 1.
372 * @stable ICU 2.1
373 */
374 public static int getCharCount(int char32)
375 {
376 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
377 return 1;
378 }
379 return 2;
380 }
381
382 /**
383 * Determines whether the code value is a surrogate.
384 * @param char16 the input character.
385 * @return true if the input character is a surrogate.
386 * @stable ICU 2.1
387 */
388 public static boolean isSurrogate(char char16)
389 {
390 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
391 }
392
393 /**
394 * Determines whether the character is a trail surrogate.
395 * @param char16 the input character.
396 * @return true if the input character is a trail surrogate.
397 * @stable ICU 2.1
398 */
399 public static boolean isTrailSurrogate(char char16)
400 {
401 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
402 }
403
404 /**
405 * Determines whether the character is a lead surrogate.
406 * @param char16 the input character.
407 * @return true if the input character is a lead surrogate
408 * @stable ICU 2.1
409 */
410 public static boolean isLeadSurrogate(char char16)
411 {
412 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
413 }
414
415 /**
416 * Returns the lead surrogate.
417 * If a validity check is required, use
418 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
419 * on char32 before calling.
420 * @param char32 the input character.
421 * @return lead surrogate if the getCharCount(ch) is 2; <br>
422 * and 0 otherwise (note: 0 is not a valid lead surrogate).
423 * @stable ICU 2.1
424 */
425 public static char getLeadSurrogate(int char32)
426 {
427 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
428 return (char)(LEAD_SURROGATE_OFFSET_ +
429 (char32 >> LEAD_SURROGATE_SHIFT_));
430 }
431
432 return 0;
433 }
434
435 /**
436 * Returns the trail surrogate.
437 * If a validity check is required, use
438 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
439 * on char32 before calling.
440 * @param char32 the input character.
441 * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
442 * the character itself
443 * @stable ICU 2.1
444 */
445 public static char getTrailSurrogate(int char32)
446 {
447 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
448 return (char)(TRAIL_SURROGATE_MIN_VALUE +
449 (char32 & TRAIL_SURROGATE_MASK_));
450 }
451
452 return (char) char32;
453 }
454
455 /**
456 * Convenience method corresponding to String.valueOf(char). Returns a one
457 * or two char string containing the UTF-32 value in UTF16 format. If a
458 * validity check is required, use
459 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
460 * on char32 before calling.
461 * @param char32 the input character.
462 * @return string value of char32 in UTF16 format
463 * @exception IllegalArgumentException thrown if char32 is a invalid
464 * codepoint.
465 * @stable ICU 2.1
466 */
467 public static String valueOf(int char32)
468 {
469 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
470 throw new IllegalArgumentException("Illegal codepoint");
471 }
472 return toString(char32);
481 * @param char32 value to append.
482 * @return the updated StringBuffer
483 * @exception IllegalArgumentException thrown when char32 does not lie
484 * within the range of the Unicode codepoints
485 * @stable ICU 2.1
486 */
487 public static StringBuffer append(StringBuffer target, int char32)
488 {
489 // Check for irregular values
490 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
491 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
492 }
493
494 // Write the UTF-16 values
495 if (char32 >= SUPPLEMENTARY_MIN_VALUE)
496 {
497 target.append(getLeadSurrogate(char32));
498 target.append(getTrailSurrogate(char32));
499 }
500 else {
501 target.append((char) char32);
502 }
503 return target;
504 }
505
506 /**
507 * Shifts offset16 by the argument number of codepoints within a subarray.
508 * @param source char array
509 * @param start position of the subarray to be performed on
510 * @param limit position of the subarray to be performed on
511 * @param offset16 UTF16 position to shift relative to start
512 * @param shift32 number of codepoints to shift
513 * @return new shifted offset16 relative to start
514 * @exception IndexOutOfBoundsException if the new offset16 is out of
515 * bounds with respect to the subarray or the subarray bounds
516 * are out of range.
517 * @stable ICU 2.1
518 */
519 public static int moveCodePointOffset(char source[], int start, int limit,
520 int offset16, int shift32)
521 {
522 int size = source.length;
523 int count;
524 char ch;
525 int result = offset16 + start;
526 if (start < 0 || limit < start) {
527 throw new StringIndexOutOfBoundsException(start);
528 }
529 if (limit > size) {
530 throw new StringIndexOutOfBoundsException(limit);
531 }
532 if (offset16 < 0 || result > limit) {
533 throw new StringIndexOutOfBoundsException(offset16);
534 }
535 if (shift32 > 0) {
536 if (shift32 + result > size) {
537 throw new StringIndexOutOfBoundsException(result);
538 }
539 count = shift32;
540 while (result < limit && count > 0)
541 {
542 ch = source[result];
543 if (isLeadSurrogate(ch) && (result + 1 < limit) &&
544 isTrailSurrogate(source[result + 1])) {
545 result++;
546 }
547 count--;
548 result++;
549 }
550 } else {
551 if (result + shift32 < start) {
552 throw new StringIndexOutOfBoundsException(result);
553 }
554 for (count = -shift32; count > 0; count--) {
555 result--;
556 if (result < start) {
557 break;
558 }
559 ch = source[result];
560 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
561 result--;
562 }
563 }
564 }
565 if (count != 0) {
566 throw new StringIndexOutOfBoundsException(shift32);
567 }
568 result -= start;
569 return result;
570 }
571
572 // private data members -------------------------------------------------
573
574 /**
575 * Shift value for lead surrogate to form a supplementary character.
576 */
577 private static final int LEAD_SURROGATE_SHIFT_ = 10;
578
579 /**
580 * Mask to retrieve the significant value from a trail surrogate.
588 LEAD_SURROGATE_MIN_VALUE -
589 (SUPPLEMENTARY_MIN_VALUE
590 >> LEAD_SURROGATE_SHIFT_);
591
592 // private methods ------------------------------------------------------
593
594 /**
595 * <p>Converts argument code point and returns a String object representing
596 * the code point's value in UTF16 format.
597 * <p>This method does not check for the validity of the codepoint, the
598 * results are not guaranteed if a invalid codepoint is passed as
599 * argument.
600 * <p>The result is a string whose length is 1 for non-supplementary code
601 * points, 2 otherwise.
602 * @param ch code point
603 * @return string representation of the code point
604 */
605 private static String toString(int ch)
606 {
607 if (ch < SUPPLEMENTARY_MIN_VALUE) {
608 return String.valueOf((char) ch);
609 }
610
611 StringBuilder result = new StringBuilder();
612 result.append(getLeadSurrogate(ch));
613 result.append(getTrailSurrogate(ch));
614 return result.toString();
615 }
616 }
|