1 /*
   2  * Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * (C) Copyright Taligent, Inc. 1996,1997 - All Rights Reserved
  28  * (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved
  29  *
  30  *   The original version of this source code and documentation is copyrighted
  31  * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
  32  * materials are provided under terms of a License Agreement between Taligent
  33  * and Sun. This technology is protected by multiple US and International
  34  * patents. This notice and attribution to Taligent may not be removed.
  35  *   Taligent is a registered trademark of Taligent, Inc.
  36  *
  37  */
  38 
  39 package sun.util.locale.provider;
  40 /**
  41  * CollationRules contains the default en_US collation rules as a base
  42  * for building other collation tables.
      * <p>The rules live in a single string constant, {@link #DEFAULTRULES},
      * laid out in sections: ignorable characters (everything before the first
      * {@code <}), punctuation and symbols, numerics, and letters.  The rule
      * syntax is the one documented for RuleBasedCollator.
  43  * <p>Note that decompositions are done before these rules are used,
  44  * so they do not have to contain accented characters, such as A-grave.
      * <p>This class is a constant holder only: its constructor is private.
  45  * @see                RuleBasedCollator
  46  * @see                LocaleElements
  47  * @author             Helena Shih, Mark Davis
  48  */
  49 final class CollationRules {
         /**
          * The default rule string, assembled from adjacent string literals.
          * <p>Reading notes, per the RuleBasedCollator rule syntax:
          * <ul>
          * <li>{@code =}, {@code ;}, {@code ,} and {@code <} introduce an entry
          *     and state how it relates to the entry before it; {@code &}
          *     re-anchors the entries that follow to text placed earlier.</li>
          * <li>Whitespace and ASCII punctuation are rule syntax, so such
          *     characters are wrapped in single quotes when meant literally.</li>
          * <li>The relations are positional: do not reorder or merge entries.</li>
          * </ul>
          */
  50     final static String DEFAULTRULES =
  51         "" // no FRENCH accent order by default, add in French Delta
  52         // IGNORABLES (up to first < character)
  53         // COMPLETELY IGNORE format characters
  54         + "='\u200B'=\u200C=\u200D=\u200E=\u200F" // zwsp (quoted), zwnj, zwj, lrm, rlm
  55         // Control Characters
             // (U+000A, U+000C and U+000D are not listed in this run; they appear
             // in the whitespace entries further down.)
  56         + "=\u0000 =\u0001 =\u0002 =\u0003 =\u0004" // nul, soh, stx, etx, eot
  57         + "=\u0005 =\u0006 =\u0007 =\u0008 ='\u0009'" // enq, ack, bel, bs, ht (quoted)
  58         + "='\u000b' =\u000e" // vt (quoted), so
  59         + "=\u000f ='\u0010' =\u0011 =\u0012 =\u0013" //si, dle, dc1, dc2, dc3
  60         + "=\u0014 =\u0015 =\u0016 =\u0017 =\u0018" //dc4, nak, syn, etb, can
  61         + "=\u0019 =\u001a =\u001b =\u001c =\u001d" //em, sub, esc, fs, gs
  62         + "=\u001e =\u001f =\u007f"                   //rs, us, del
  63         //....then the C1 Latin 1 reserved control codes
  64         + "=\u0080 =\u0081 =\u0082 =\u0083 =\u0084 =\u0085"
  65         + "=\u0086 =\u0087 =\u0088 =\u0089 =\u008a =\u008b"
  66         + "=\u008c =\u008d =\u008e =\u008f =\u0090 =\u0091"
  67         + "=\u0092 =\u0093 =\u0094 =\u0095 =\u0096 =\u0097"
  68         + "=\u0098 =\u0099 =\u009a =\u009b =\u009c =\u009d"
  69         + "=\u009e =\u009f"
  70         // IGNORE except for secondary, tertiary difference
  71         // Spaces
  72         + ";'\u0020';'\u00A0'"                  // space, no-break space
  73         + ";'\u2000';'\u2001';'\u2002';'\u2003';'\u2004'"  // spaces
  74         + ";'\u2005';'\u2006';'\u2007';'\u2008';'\u2009'"  // spaces
  75         + ";'\u200A';'\u3000';'\uFEFF'"                // hair space, ideographic space, zero width no-break space
             // CR, tab, LF, FF and vt, all quoted.  They are written as escape
             // sequences because Java translates Unicode escapes before lexing,
             // so a Unicode escape for CR or LF would break the string literal.
             // NOTE(review): tab (U+0009) and vt (U+000B) are also listed in the
             // control-character run above; confirm the duplicates are intended.
  76         + ";'\r' ;'\t' ;'\n';'\f';'\u000b'"  // whitespace
  77 
  78         // Non-spacing accents
  79 
  80         + ";\u0301"          // non-spacing acute accent
  81         + ";\u0300"          // non-spacing grave accent
  82         + ";\u0306"          // non-spacing breve accent
  83         + ";\u0302"          // non-spacing circumflex accent
  84         + ";\u030c"          // non-spacing caron/hacek accent
  85         + ";\u030a"          // non-spacing ring above accent
  86         + ";\u030d"          // non-spacing vertical line above
  87         + ";\u0308"          // non-spacing diaeresis accent
  88         + ";\u030b"          // non-spacing double acute accent
  89         + ";\u0303"          // non-spacing tilde accent
  90         + ";\u0307"          // non-spacing dot above/overdot accent
  91         + ";\u0304"          // non-spacing macron accent
  92         + ";\u0337"          // non-spacing short slash overlay (overstruck diacritic)
  93         + ";\u0327"          // non-spacing cedilla accent
  94         + ";\u0328"          // non-spacing ogonek accent
  95         + ";\u0323"          // non-spacing dot-below/underdot accent
  96         + ";\u0332"          // non-spacing underscore/underline accent
  97         // with the rest of the general diacritical marks in binary order
  98         + ";\u0305"          // non-spacing overscore/overline
  99         + ";\u0309"          // non-spacing hook above
 100         + ";\u030e"          // non-spacing double vertical line above
 101         + ";\u030f"          // non-spacing double grave
 102         + ";\u0310"          // non-spacing chandrabindu
 103         + ";\u0311"          // non-spacing inverted breve
 104         + ";\u0312"          // non-spacing turned comma above/cedilla above
 105         + ";\u0313"          // non-spacing comma above
 106         + ";\u0314"          // non-spacing reversed comma above
 107         + ";\u0315"          // non-spacing comma above right
 108         + ";\u0316"          // non-spacing grave below
 109         + ";\u0317"          // non-spacing acute below
 110         + ";\u0318"          // non-spacing left tack below
 111         + ";\u0319"          // non-spacing tack below
 112         + ";\u031a"          // non-spacing left angle above
 113         + ";\u031b"          // non-spacing horn
 114         + ";\u031c"          // non-spacing left half ring below
 115         + ";\u031d"          // non-spacing up tack below
 116         + ";\u031e"          // non-spacing down tack below
 117         + ";\u031f"          // non-spacing plus sign below
 118         + ";\u0320"          // non-spacing minus sign below
 119         + ";\u0321"          // non-spacing palatalized hook below
 120         + ";\u0322"          // non-spacing retroflex hook below
 121         + ";\u0324"          // non-spacing double dot below
 122         + ";\u0325"          // non-spacing ring below
 123         + ";\u0326"          // non-spacing comma below
 124         + ";\u0329"          // non-spacing vertical line below
 125         + ";\u032a"          // non-spacing bridge below
 126         + ";\u032b"          // non-spacing inverted double arch below
 127         + ";\u032c"          // non-spacing hacek below
 128         + ";\u032d"          // non-spacing circumflex below
 129         + ";\u032e"          // non-spacing breve below
 130         + ";\u032f"          // non-spacing inverted breve below
 131         + ";\u0330"          // non-spacing tilde below
 132         + ";\u0331"          // non-spacing macron below
 133         + ";\u0333"          // non-spacing double underscore
 134         + ";\u0334"          // non-spacing tilde overlay
 135         + ";\u0335"          // non-spacing short bar overlay
 136         + ";\u0336"          // non-spacing long bar overlay
 137         + ";\u0338"          // non-spacing long slash overlay
 138         + ";\u0339"          // non-spacing right half ring below
 139         + ";\u033a"          // non-spacing inverted bridge below
 140         + ";\u033b"          // non-spacing square below
 141         + ";\u033c"          // non-spacing seagull below
 142         + ";\u033d"          // non-spacing x above
 143         + ";\u033e"          // non-spacing vertical tilde
 144         + ";\u033f"          // non-spacing double overscore
             // The next three marks are left out (commented out) because the
             // existing notes mark them as equal to marks already listed above.
 145         //+ ";\u0340"          // non-spacing grave tone mark == \u0300
 146         //+ ";\u0341"          // non-spacing acute tone mark == \u0301
             // Here the separator trails its entry, so this literal ends in a
             // semicolon and the next active literal starts with a bare character.
 147         + ";\u0342;"
 148         //+ "\u0343;" // == \u0313
 149         + "\u0344;\u0345;\u0360;\u0361"    // newer
 150         + ";\u0483;\u0484;\u0485;\u0486"    // Cyrillic accents
 151 
 152         + ";\u20D0;\u20D1;\u20D2"           // symbol accents
 153         + ";\u20D3;\u20D4;\u20D5"           // symbol accents
 154         + ";\u20D6;\u20D7;\u20D8"           // symbol accents
 155         + ";\u20D9;\u20DA;\u20DB"           // symbol accents
 156         + ";\u20DC;\u20DD;\u20DE"           // symbol accents
 157         + ";\u20DF;\u20E0;\u20E1"           // symbol accents
 158 
             // The dash group is introduced with a comma rather than a semicolon.
 159         + ",'\u002D';\u00AD"                // dashes: hyphen-minus (quoted), soft hyphen
 160         + ";\u2010;\u2011;\u2012"           // dashes: hyphen, non-breaking hyphen, figure dash
 161         + ";\u2013;\u2014;\u2015"           // dashes: en dash, em dash, horizontal bar
 162         + ";\u2212"                         // dashes: minus sign
 163 
 164         // other punctuation
             // (the first entry below carries the first < of the rule string, which
             // is where the ignorable section ends)
 165 
 166         + "<'\u005f'"        // underline/underscore (spacing)
 167         + "<\u00af"          // overline or macron (spacing)
 168         + "<'\u002c'"        // comma (spacing)
 169         + "<'\u003b'"        // semicolon
 170         + "<'\u003a'"        // colon
 171         + "<'\u0021'"        // exclamation point
 172         + "<\u00a1"          // inverted exclamation point
 173         + "<'\u003f'"        // question mark
 174         + "<\u00bf"          // inverted question mark
 175         + "<'\u002f'"        // slash
 176         + "<'\u002e'"        // period/full stop
 177         + "<\u00b4"          // acute accent (spacing)
 178         + "<'\u0060'"        // grave accent (spacing)
 179         + "<'\u005e'"        // circumflex accent (spacing)
 180         + "<\u00a8"          // diaeresis/umlaut accent (spacing)
 181         + "<'\u007e'"        // tilde accent (spacing)
 182         + "<\u00b7"          // middle dot (spacing)
 183         + "<\u00b8"          // cedilla accent (spacing)
 184         + "<'\u0027'"        // apostrophe
 185         + "<'\"'"            // quotation marks (escape sequence: a Unicode escape would end the literal)
 186         + "<\u00ab"          // left angle quotes
 187         + "<\u00bb"          // right angle quotes
 188         + "<'\u0028'"        // left parenthesis
 189         + "<'\u0029'"        // right parenthesis
 190         + "<'\u005b'"        // left bracket
 191         + "<'\u005d'"        // right bracket
 192         + "<'\u007b'"        // left brace
 193         + "<'\u007d'"        // right brace
 194         + "<\u00a7"          // section symbol
 195         + "<\u00b6"          // paragraph symbol
 196         + "<\u00a9"          // copyright symbol
 197         + "<\u00ae"          // registered trademark symbol
 198         + "<'\u0040'"          // at sign
 199         + "<\u00a4"          // international currency symbol
 200         + "<\u0e3f"          // baht sign
 201         + "<\u00a2"          // cent sign
 202         + "<\u20a1"          // colon sign
 203         + "<\u20a2"          // cruzeiro sign
 204         + "<'\u0024'"        // dollar sign
 205         + "<\u20ab"          // dong sign
 206         + "<\u20ac"          // euro sign
 207         + "<\u20a3"          // franc sign
 208         + "<\u20a4"          // lira sign
 209         + "<\u20a5"          // mill sign
 210         + "<\u20a6"          // naira sign
 211         + "<\u20a7"          // peseta sign
 212         + "<\u00a3"          // pound-sterling sign
 213         + "<\u20a8"          // rupee sign
 214         + "<\u20aa"          // new shekel sign
 215         + "<\u20a9"          // won sign
 216         + "<\u00a5"          // yen sign
 217         + "<'\u002a'"        // asterisk
 218         + "<'\\'"            // backslash (escape sequence, quoted)
 219         + "<'\u0026'"        // ampersand
 220         + "<'\u0023'"        // number sign
 221         + "<'\u0025'"        // percent sign
 222         + "<'\u002b'"        // plus sign
 223         + "<\u00b1"          // plus-or-minus sign
 224         + "<\u00f7"          // divide sign
 225         + "<\u00d7"          // multiply sign
 226         + "<'\u003c'"        // less-than sign
 227         + "<'\u003d'"        // equal sign
 228         + "<'\u003e'"        // greater-than sign
 229         + "<\u00ac"          // end of line symbol/logical NOT symbol
 230         + "<'\u007c'"          // vertical line/logical OR symbol
 231         + "<\u00a6"          // broken vertical line
 232         + "<\u00b0"          // degree symbol
 233         + "<\u00b5"          // micro symbol
 234 
 235         // NUMERICS
 236 
 237         + "<0<1<2<3<4<5<6<7<8<9"
 238         + "<\u00bc<\u00bd<\u00be"   // 1/4,1/2,3/4 fractions
 239 
 240         // NON-IGNORABLES
             // Each letter entry lists the lower-case form first, then the
             // upper-case form after a comma.
 241         + "<a,A"
 242         + "<b,B"
 243         + "<c,C"
 244         + "<d,D"
 245         + "<\u00F0,\u00D0"                  // eth (lower, upper)
 246         + "<e,E"
 247         + "<f,F"
 248         + "<g,G"
 249         + "<h,H"
 250         + "<i,I"
 251         + "<j,J"
 252         + "<k,K"
 253         + "<l,L"
 254         + "<m,M"
 255         + "<n,N"
 256         + "<o,O"
 257         + "<p,P"
 258         + "<q,Q"
 259         + "<r,R"
 260         + "<s, S & SS,\u00DF"             // s, S; s-zet (U+00DF) is attached to the sequence SS
 261         + "<t,T"
 262         + "& TH, \u00DE &TH, \u00FE "     // thorn: upper (U+00DE) and lower (U+00FE), both attached to TH
 263         + "<u,U"
 264         + "<v,V"
 265         + "<w,W"
 266         + "<x,X"
 267         + "<y,Y"
 268         + "<z,Z"
             // NOTE(review): in the four ligature entries below, both the upper-case
             // and the lower-case ligature are attached to the upper-case letter
             // pair (AE, OE); confirm that this is intended.
 269         + "&AE,\u00C6"                    // AE ligature, upper case (U+00C6)
 270         + "&AE,\u00E6"                    // ae ligature, lower case (U+00E6)
 271         + "&OE,\u0152"                    // OE ligature, upper case (U+0152)
 272         + "&OE,\u0153";                   // oe ligature, lower case (U+0153)
 273 
 274     // No instantiation: the class only holds the DEFAULTRULES constant.
 275     private CollationRules() {
 276     }
 277 }