< prev index next >
src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java
Print this page
*** 53,72 ****
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
! * <p>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
! * </p>
*
* or as two separate characters (the "decomposed" form):
*
! * <p>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
! * </p>
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you
* are searching or comparing text, you must ensure that these two sequences are
* treated equivalently. In addition, you must handle characters with more than
--- 53,72 ----
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
! * <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
! * </pre>
*
* or as two separate characters (the "decomposed" form):
*
! * <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
! * </pre>
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you
* are searching or comparing text, you must ensure that these two sequences are
* treated equivalently. In addition, you must handle characters with more than
*** 74,94 ****
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
! * <p>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
! * </p>
*
* or as the single character
*
! * <p>
* FB03 LATIN SMALL LIGATURE FFI
! * </p>
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
--- 74,94 ----
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
! * <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
! * </pre>
*
* or as the single character
*
! * <pre>
* FB03 LATIN SMALL LIGATURE FFI
! * </pre>
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
*** 553,568 ****
//-------------------------------------------------------------------------
// Constructors
//-------------------------------------------------------------------------
/**
! * Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of a given string.
* <p>
! * The <tt>options</tt> parameter specifies which optional
! * <tt>Normalizer</tt> features are to be enabled for this object.
! * <p>
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*
--- 553,568 ----
//-------------------------------------------------------------------------
// Constructors
//-------------------------------------------------------------------------
/**
! * Creates a new {@code Normalizer} object for iterating over the
* normalized form of a given string.
* <p>
! * The {@code options} parameter specifies which optional
! * {@code Normalizer} features are to be enabled for this object.
! *
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*
*** 577,602 ****
this.mode = mode;
this.options=opt;
}
/**
! * Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of the given text.
! * <p>
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*/
public NormalizerBase(CharacterIterator iter, Mode mode) {
this(iter, mode, UNICODE_LATEST);
}
/**
! * Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of the given text.
! * <p>
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*
--- 577,602 ----
this.mode = mode;
this.options=opt;
}
/**
! * Creates a new {@code Normalizer} object for iterating over the
* normalized form of the given text.
! *
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*/
public NormalizerBase(CharacterIterator iter, Mode mode) {
this(iter, mode, UNICODE_LATEST);
}
/**
! * Creates a new {@code Normalizer} object for iterating over the
* normalized form of the given text.
! *
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
*
*** 613,629 ****
this.mode = mode;
this.options = opt;
}
/**
! * Clones this <tt>Normalizer</tt> object. All properties of this
* object are duplicated in the new object, including the cloning of any
* {@link CharacterIterator} that was passed in to the constructor
* or to {@link #setText(CharacterIterator) setText}.
* However, the text storage underlying
! * the <tt>CharacterIterator</tt> is not duplicated unless the
! * iterator's <tt>clone</tt> method does so.
* @stable ICU 2.8
*/
public Object clone() {
try {
NormalizerBase copy = (NormalizerBase) super.clone();
--- 613,629 ----
this.mode = mode;
this.options = opt;
}
/**
! * Clones this {@code Normalizer} object. All properties of this
* object are duplicated in the new object, including the cloning of any
* {@link CharacterIterator} that was passed in to the constructor
* or to {@link #setText(CharacterIterator) setText}.
* However, the text storage underlying
! * the {@code CharacterIterator} is not duplicated unless the
! * iterator's {@code clone} method does so.
* @stable ICU 2.8
*/
public Object clone() {
try {
NormalizerBase copy = (NormalizerBase) super.clone();
*** 789,799 ****
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
! * Return the current character in the normalized text->
* @return The codepoint as an int
* @stable ICU 2.8
*/
public int current() {
if(bufferPos<bufferLimit || nextNormalize()) {
--- 789,799 ----
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
! * Return the current character in the normalized text.
* @return The codepoint as an int
* @stable ICU 2.8
*/
public int current() {
if(bufferPos<bufferLimit || nextNormalize()) {
*** 870,883 ****
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em> text,
* while {@link #next} and {@link #previous} iterate through characters
* in the normalized <em>output</em>. This means that there is not
* necessarily a one-to-one correspondence between characters returned
! * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
! * returned from <tt>setIndex</tt> and {@link #getIndex}.
! * <p>
! * @param index the desired index in the input text->
*
* @return the first normalized character that is the result of iterating
* forward starting at the given index.
*
* @throws IllegalArgumentException if the given index is less than
--- 870,883 ----
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em> text,
* while {@link #next} and {@link #previous} iterate through characters
* in the normalized <em>output</em>. This means that there is not
* necessarily a one-to-one correspondence between characters returned
! * by {@code next} and {@code previous} and the indices passed to and
! * returned from {@code setIndex} and {@link #getIndex}.
! *
! * @param index the desired index in the input text.
*
* @return the first normalized character that is the result of iterating
* forward starting at the given index.
*
* @throws IllegalArgumentException if the given index is less than
*** 892,903 ****
return current();
}
/**
* Retrieve the index of the start of the input text. This is the begin
! * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
! * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return The start index of the input text
* @see #startIndex
*/
@Deprecated
--- 892,903 ----
return current();
}
/**
* Retrieve the index of the start of the input text. This is the begin
! * index of the {@code CharacterIterator} or the start (i.e. 0) of the
! * {@code String} over which this {@code Normalizer} is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return The start index of the input text
* @see #startIndex
*/
@Deprecated
*** 905,916 ****
return 0;
}
/**
* Retrieve the index of the end of the input text. This is the end index
! * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
! * over which this <tt>Normalizer</tt> is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return The end index of the input text
* @see #endIndex
*/
@Deprecated
--- 905,916 ----
return 0;
}
/**
* Retrieve the index of the end of the input text. This is the end index
! * of the {@code CharacterIterator} or the length of the {@code String}
! * over which this {@code Normalizer} is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return The end index of the input text
* @see #endIndex
*/
@Deprecated
*** 925,937 ****
* the input text that corresponds to a given normalized output character.
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em>, while
* {@link #next} and {@link #previous} iterate through characters in the
* <em>output</em>. This means that there is not necessarily a one-to-one
! * correspondence between characters returned by <tt>next</tt> and
! * <tt>previous</tt> and the indices passed to and returned from
! * <tt>setIndex</tt> and {@link #getIndex}.
* @return The current iteration position
* @stable ICU 2.8
*/
public int getIndex() {
if(bufferPos<bufferLimit) {
--- 925,937 ----
* the input text that corresponds to a given normalized output character.
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em>, while
* {@link #next} and {@link #previous} iterate through characters in the
* <em>output</em>. This means that there is not necessarily a one-to-one
! * correspondence between characters returned by {@code next} and
! * {@code previous} and the indices passed to and returned from
! * {@code setIndex} and {@link #getIndex}.
* @return The current iteration position
* @stable ICU 2.8
*/
public int getIndex() {
if(bufferPos<bufferLimit) {
*** 940,952 ****
return nextIndex;
}
}
/**
! * Retrieve the index of the end of the input text-> This is the end index
! * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
! * over which this <tt>Normalizer</tt> is iterating
* @return The end index of the input text
* @stable ICU 2.8
*/
public int endIndex() {
return text.getLength();
--- 940,952 ----
return nextIndex;
}
}
/**
! * Retrieve the index of the end of the input text. This is the end index
! * of the {@code CharacterIterator} or the length of the {@code String}
! * over which this {@code Normalizer} is iterating
* @return The end index of the input text
* @stable ICU 2.8
*/
public int endIndex() {
return text.getLength();
*** 961,973 ****
* <b>Note:</b> If the normalization mode is changed while iterating
* over a string, calls to {@link #next} and {@link #previous} may
* return previously buffered characters in the old normalization mode
* until the iteration is able to re-sync at the next base character.
* It is safest to call {@link #setText setText()}, {@link #first},
! * {@link #last}, etc. after calling <tt>setMode</tt>.
! * <p>
! * @param newMode the new mode for this <tt>Normalizer</tt>.
* The supported modes are:
* <ul>
* <li>{@link #COMPOSE} - Unicode canonical decomposition
* followed by canonical composition.
* <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
--- 961,973 ----
* <b>Note:</b> If the normalization mode is changed while iterating
* over a string, calls to {@link #next} and {@link #previous} may
* return previously buffered characters in the old normalization mode
* until the iteration is able to re-sync at the next base character.
* It is safest to call {@link #setText setText()}, {@link #first},
! * {@link #last}, etc. after calling {@code setMode}.
! *
! * @param newMode the new mode for this {@code Normalizer}.
* The supported modes are:
* <ul>
* <li>{@link #COMPOSE} - Unicode canonical decomposition
* followed by canonical composition.
* <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
*** 983,1004 ****
*/
public void setMode(Mode newMode) {
mode = newMode;
}
/**
! * Return the basic operation performed by this <tt>Normalizer</tt>
*
* @see #setMode
* @stable ICU 2.8
*/
public Mode getMode() {
return mode;
}
/**
! * Set the input text over which this <tt>Normalizer</tt> will iterate.
! * The iteration position is set to the beginning of the input text->
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(String newText) {
--- 983,1004 ----
*/
public void setMode(Mode newMode) {
mode = newMode;
}
/**
! * Return the basic operation performed by this {@code Normalizer}
*
* @see #setMode
* @stable ICU 2.8
*/
public Mode getMode() {
return mode;
}
/**
! * Set the input text over which this {@code Normalizer} will iterate.
! * The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(String newText) {
*** 1009,1020 ****
text = newIter;
reset();
}
/**
! * Set the input text over which this <tt>Normalizer</tt> will iterate.
! * The iteration position is set to the beginning of the input text->
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(CharacterIterator newText) {
--- 1009,1020 ----
text = newIter;
reset();
}
/**
! * Set the input text over which this {@code Normalizer} will iterate.
! * The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(CharacterIterator newText) {
*** 1569,1579 ****
// public constructor and methods for java.text.Normalizer and
// sun.text.Normalizer
//
/**
! * Creates a new <tt>Normalizer</tt> object for iterating over the
* normalized form of a given string.
*
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
--- 1569,1579 ----
// public constructor and methods for java.text.Normalizer and
// sun.text.Normalizer
//
/**
! * Creates a new {@code Normalizer} object for iterating over the
* normalized form of a given string.
*
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
*** 1644,1654 ****
* For NFD, NFKD, and FCD, both functions work exactly the same.
* For NFC and NFKC where quickCheck may return "maybe", this function will
* perform further tests to arrive at a true/false result.
* @param str the input string to be checked to see if it is normalized
* @param form the normalization form
- * @param options the optional features to be enabled.
*/
public static boolean isNormalized(String str, Normalizer.Form form) {
return isNormalized(str, form, UNICODE_LATEST);
}
--- 1644,1653 ----
< prev index next >