< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java

Print this page

        

*** 53,72 **** * Characters with accents or other adornments can be encoded in * several different ways in Unicode. For example, take the character A-acute. * In Unicode, this can be encoded as a single character (the * "composed" form): * ! * <p> * 00C1 LATIN CAPITAL LETTER A WITH ACUTE ! * </p> * * or as two separate characters (the "decomposed" form): * ! * <p> * 0041 LATIN CAPITAL LETTER A * 0301 COMBINING ACUTE ACCENT ! * </p> * * To a user of your program, however, both of these sequences should be * treated as the same "user-level" character "A with acute accent". When you * are searching or comparing text, you must ensure that these two sequences are * treated equivalently. In addition, you must handle characters with more than --- 53,72 ---- * Characters with accents or other adornments can be encoded in * several different ways in Unicode. For example, take the character A-acute. * In Unicode, this can be encoded as a single character (the * "composed" form): * ! * <pre> * 00C1 LATIN CAPITAL LETTER A WITH ACUTE ! * </pre> * * or as two separate characters (the "decomposed" form): * ! * <pre> * 0041 LATIN CAPITAL LETTER A * 0301 COMBINING ACUTE ACCENT ! * </pre> * * To a user of your program, however, both of these sequences should be * treated as the same "user-level" character "A with acute accent". When you * are searching or comparing text, you must ensure that these two sequences are * treated equivalently. In addition, you must handle characters with more than
*** 74,94 **** * significant, while in other cases accent sequences in different orders are * really equivalent. * * Similarly, the string "ffi" can be encoded as three separate letters: * ! * <p> * 0066 LATIN SMALL LETTER F * 0066 LATIN SMALL LETTER F * 0069 LATIN SMALL LETTER I ! * </p> * * or as the single character * ! * <p> * FB03 LATIN SMALL LIGATURE FFI ! * </p> * * The ffi ligature is not a distinct semantic character, and strictly speaking * it shouldn't be in Unicode at all, but it was included for compatibility * with existing character sets that already provided it. The Unicode standard * identifies such characters by giving them "compatibility" decompositions --- 74,94 ---- * significant, while in other cases accent sequences in different orders are * really equivalent. * * Similarly, the string "ffi" can be encoded as three separate letters: * ! * <pre> * 0066 LATIN SMALL LETTER F * 0066 LATIN SMALL LETTER F * 0069 LATIN SMALL LETTER I ! * </pre> * * or as the single character * ! * <pre> * FB03 LATIN SMALL LIGATURE FFI ! * </pre> * * The ffi ligature is not a distinct semantic character, and strictly speaking * it shouldn't be in Unicode at all, but it was included for compatibility * with existing character sets that already provided it. The Unicode standard * identifies such characters by giving them "compatibility" decompositions
*** 553,568 **** //------------------------------------------------------------------------- // Constructors //------------------------------------------------------------------------- /** ! * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of a given string. * <p> ! * The <tt>options</tt> parameter specifies which optional ! * <tt>Normalizer</tt> features are to be enabled for this object. ! * <p> * @param str The string to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * --- 553,568 ---- //------------------------------------------------------------------------- // Constructors //------------------------------------------------------------------------- /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of a given string. * <p> ! * The {@code options} parameter specifies which optional ! * {@code Normalizer} features are to be enabled for this object. ! * * @param str The string to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. *
*** 577,602 **** this.mode = mode; this.options=opt; } /** ! * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of the given text. ! * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. */ public NormalizerBase(CharacterIterator iter, Mode mode) { this(iter, mode, UNICODE_LATEST); } /** ! * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of the given text. ! * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * --- 577,602 ---- this.mode = mode; this.options=opt; } /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of the given text. ! * * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. */ public NormalizerBase(CharacterIterator iter, Mode mode) { this(iter, mode, UNICODE_LATEST); } /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of the given text. ! * * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. *
*** 613,629 **** this.mode = mode; this.options = opt; } /** ! * Clones this <tt>Normalizer</tt> object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. * However, the text storage underlying ! * the <tt>CharacterIterator</tt> is not duplicated unless the ! * iterator's <tt>clone</tt> method does so. * @stable ICU 2.8 */ public Object clone() { try { NormalizerBase copy = (NormalizerBase) super.clone(); --- 613,629 ---- this.mode = mode; this.options = opt; } /** ! * Clones this {@code Normalizer} object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. * However, the text storage underlying ! * the {@code CharacterIterator} is not duplicated unless the ! * iterator's {@code clone} method does so. * @stable ICU 2.8 */ public Object clone() { try { NormalizerBase copy = (NormalizerBase) super.clone();
*** 789,799 **** //------------------------------------------------------------------------- // Iteration API //------------------------------------------------------------------------- /** ! * Return the current character in the normalized text-> * @return The codepoint as an int * @stable ICU 2.8 */ public int current() { if(bufferPos<bufferLimit || nextNormalize()) { --- 789,799 ---- //------------------------------------------------------------------------- // Iteration API //------------------------------------------------------------------------- /** ! * Return the current character in the normalized text. * @return The codepoint as an int * @stable ICU 2.8 */ public int current() { if(bufferPos<bufferLimit || nextNormalize()) {
*** 870,883 **** * <p> * <b>Note:</b> This method sets the position in the <em>input</em> text, * while {@link #next} and {@link #previous} iterate through characters * in the normalized <em>output</em>. This means that there is not * necessarily a one-to-one correspondence between characters returned ! * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and ! * returned from <tt>setIndex</tt> and {@link #getIndex}. ! * <p> ! * @param index the desired index in the input text-> * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than --- 870,883 ---- * <p> * <b>Note:</b> This method sets the position in the <em>input</em> text, * while {@link #next} and {@link #previous} iterate through characters * in the normalized <em>output</em>. This means that there is not * necessarily a one-to-one correspondence between characters returned ! * by {@code next} and {@code previous} and the indices passed to and ! * returned from {@code setIndex} and {@link #getIndex}. ! * ! * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than
*** 892,903 **** return current(); } /** * Retrieve the index of the start of the input text. This is the begin ! * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the ! * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex */ @Deprecated --- 892,903 ---- return current(); } /** * Retrieve the index of the start of the input text. This is the begin ! * index of the {@code CharacterIterator} or the start (i.e. 0) of the ! * {@code String} over which this {@code Normalizer} is iterating * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex */ @Deprecated
*** 905,916 **** return 0; } /** * Retrieve the index of the end of the input text. This is the end index ! * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> ! * over which this <tt>Normalizer</tt> is iterating * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex */ @Deprecated --- 905,916 ---- return 0; } /** * Retrieve the index of the end of the input text. This is the end index ! * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code Normalizer} is iterating * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex */ @Deprecated
*** 925,937 **** * the input text that corresponds to a given normalized output character. * <p> * <b>Note:</b> This method sets the position in the <em>input</em>, while * {@link #next} and {@link #previous} iterate through characters in the * <em>output</em>. This means that there is not necessarily a one-to-one ! * correspondence between characters returned by <tt>next</tt> and ! * <tt>previous</tt> and the indices passed to and returned from ! * <tt>setIndex</tt> and {@link #getIndex}. * @return The current iteration position * @stable ICU 2.8 */ public int getIndex() { if(bufferPos<bufferLimit) { --- 925,937 ---- * the input text that corresponds to a given normalized output character. * <p> * <b>Note:</b> This method sets the position in the <em>input</em>, while * {@link #next} and {@link #previous} iterate through characters in the * <em>output</em>. This means that there is not necessarily a one-to-one ! * correspondence between characters returned by {@code next} and ! * {@code previous} and the indices passed to and returned from ! * {@code setIndex} and {@link #getIndex}. * @return The current iteration position * @stable ICU 2.8 */ public int getIndex() { if(bufferPos<bufferLimit) {
*** 940,952 **** return nextIndex; } } /** ! * Retrieve the index of the end of the input text-> This is the end index ! * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> ! * over which this <tt>Normalizer</tt> is iterating * @return The current iteration position * @stable ICU 2.8 */ public int endIndex() { return text.getLength(); --- 940,952 ---- return nextIndex; } } /** ! * Retrieve the index of the end of the input text. This is the end index ! * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code Normalizer} is iterating * @return The current iteration position * @stable ICU 2.8 */ public int endIndex() { return text.getLength();
*** 961,973 **** * <b>Note:</b>If the normalization mode is changed while iterating * over a string, calls to {@link #next} and {@link #previous} may * return previously buffered characters in the old normalization mode * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling <tt>setMode</tt>. ! * <p> ! * @param newMode the new mode for this <tt>Normalizer</tt>. * The supported modes are: * <ul> * <li>{@link #COMPOSE} - Unicode canonical decomposition * followed by canonical composition. * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition --- 961,973 ---- * <b>Note:</b>If the normalization mode is changed while iterating * over a string, calls to {@link #next} and {@link #previous} may * return previously buffered characters in the old normalization mode * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling {@code setMode}. ! * ! * @param newMode the new mode for this {@code Normalizer}. * The supported modes are: * <ul> * <li>{@link #COMPOSE} - Unicode canonical decomposition * followed by canonical composition. * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
*** 983,1004 **** */ public void setMode(Mode newMode) { mode = newMode; } /** ! * Return the basic operation performed by this <tt>Normalizer</tt> * * @see #setMode * @stable ICU 2.8 */ public Mode getMode() { return mode; } /** ! * Set the input text over which this <tt>Normalizer</tt> will iterate. ! * The iteration position is set to the beginning of the input text-> * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) { --- 983,1004 ---- */ public void setMode(Mode newMode) { mode = newMode; } /** ! * Return the basic operation performed by this {@code Normalizer} * * @see #setMode * @stable ICU 2.8 */ public Mode getMode() { return mode; } /** ! * Set the input text over which this {@code Normalizer} will iterate. ! * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) {
*** 1009,1020 **** text = newIter; reset(); } /** ! * Set the input text over which this <tt>Normalizer</tt> will iterate. ! * The iteration position is set to the beginning of the input text-> * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(CharacterIterator newText) { --- 1009,1020 ---- text = newIter; reset(); } /** ! * Set the input text over which this {@code Normalizer} will iterate. ! * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(CharacterIterator newText) {
*** 1569,1579 **** // public constructor and methods for java.text.Normalizer and // sun.text.Normalizer // /** ! * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of a given string. * * @param str The string to be normalized. The normalization * will start at the beginning of the string. * --- 1569,1579 ---- // public constructor and methods for java.text.Normalizer and // sun.text.Normalizer // /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of a given string. * * @param str The string to be normalized. The normalization * will start at the beginning of the string. *
*** 1644,1654 **** * For NFD, NFKD, and FCD, both functions work exactly the same. * For NFC and NFKC where quickCheck may return "maybe", this function will * perform further tests to arrive at a true/false result. * @param str the input string to be checked to see if it is normalized * @param form the normalization form - * @param options the optional features to be enabled. */ public static boolean isNormalized(String str, Normalizer.Form form) { return isNormalized(str, form, UNICODE_LATEST); } --- 1644,1653 ----
< prev index next >