1 /* 2 * Copyright (c) 2010, 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2010, International Business Machines Corporation and * 29 * others. All Rights Reserved. * 30 ******************************************************************************* 31 */ 32 package sun.util.locale; 33 34 import java.util.ArrayList; 35 import java.util.Collections; 36 import java.util.HashMap; 37 import java.util.List; 38 import java.util.Map; 39 import java.util.Set; 40 41 public class LanguageTag { 42 // 43 // static fields 44 // 45 public static final String SEP = "-"; 46 public static final String PRIVATEUSE = "x"; 47 public static String UNDETERMINED = "und"; 48 public static final String PRIVUSE_VARIANT_PREFIX = "lvariant"; 49 50 // 51 // Language subtag fields 52 // 53 private String _language = ""; // language subtag 54 private String _script = ""; // script subtag 55 private String _region = ""; // region subtag 56 private String _privateuse = ""; // privateuse 57 58 private List<String> _extlangs = Collections.emptyList(); // extlang subtags 59 private List<String> _variants = Collections.emptyList(); // variant subtags 60 private List<String> _extensions = Collections.emptyList(); // extensions 61 62 // Map contains grandfathered tags and its preferred mappings from 63 // http://www.ietf.org/rfc/rfc5646.txt 64 private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED = 65 new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>(); 66 67 static { 68 // grandfathered = irregular ; non-redundant tags registered 69 // / regular ; during the RFC 3066 era 70 // 71 // irregular = "en-GB-oed" ; irregular tags do not match 72 // / "i-ami" ; the 'langtag' production and 73 // / "i-bnn" ; would not otherwise be 74 // / "i-default" ; considered 'well-formed' 75 // / "i-enochian" ; These tags are all valid, 76 // / "i-hak" ; but most are deprecated 77 // / "i-klingon" ; in favor of more modern 78 // / "i-lux" ; subtags or subtag 79 // / "i-mingo" ; combination 80 // / "i-navajo" 81 // / "i-pwn" 82 // / "i-tao" 83 // / "i-tay" 84 // / "i-tsu" 85 // / "sgn-BE-FR" 86 // / "sgn-BE-NL" 87 // / "sgn-CH-DE" 88 // 89 // regular = "art-lojban" ; these tags match the 'langtag' 90 // / "cel-gaulish" ; production, but their subtags 91 // / "no-bok" ; are not extended language 92 // / "no-nyn" ; or variant subtags: their meaning 93 // / "zh-guoyu" ; is defined by their registration 94 // / "zh-hakka" ; and all of these are deprecated 95 // / "zh-min" ; in favor of a more modern 96 // / "zh-min-nan" ; subtag or sequence of subtags 97 // / "zh-xiang" 98 99 final String[][] entries = { 100 //{"tag", "preferred"}, 101 {"art-lojban", "jbo"}, 102 {"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback 103 {"en-GB-oed", "en-GB-x-oed"}, // fallback 104 {"i-ami", "ami"}, 105 {"i-bnn", "bnn"}, 106 {"i-default", "en-x-i-default"}, // fallback 107 {"i-enochian", "und-x-i-enochian"}, // fallback 108 {"i-hak", "hak"}, 109 {"i-klingon", "tlh"}, 110 {"i-lux", "lb"}, 111 {"i-mingo", "see-x-i-mingo"}, // fallback 112 {"i-navajo", "nv"}, 113 {"i-pwn", "pwn"}, 114 {"i-tao", "tao"}, 115 {"i-tay", "tay"}, 116 {"i-tsu", "tsu"}, 117 {"no-bok", "nb"}, 118 {"no-nyn", "nn"}, 119 {"sgn-BE-FR", "sfb"}, 120 {"sgn-BE-NL", "vgt"}, 121 {"sgn-CH-DE", "sgg"}, 122 {"zh-guoyu", "cmn"}, 123 {"zh-hakka", "hak"}, 124 {"zh-min", "nan-x-zh-min"}, // fallback 125 {"zh-min-nan", "nan"}, 126 {"zh-xiang", "hsn"}, 127 }; 128 for (String[] e : entries) { 129 GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); 130 } 131 } 132 133 private LanguageTag() { 134 } 135 136 /* 137 * BNF in RFC5464 138 * 139 * Language-Tag = langtag ; normal language tags 140 * / privateuse ; private use tag 141 * / grandfathered ; grandfathered tags 142 * 143 * 144 * langtag = language 145 * ["-" script] 146 * ["-" region] 147 * *("-" variant) 148 * *("-" extension) 149 * ["-" privateuse] 150 * 151 * language = 2*3ALPHA ; shortest ISO 639 code 152 * ["-" extlang] ; sometimes followed by 153 * ; extended language subtags 154 * / 4ALPHA ; or reserved for future use 155 * / 5*8ALPHA ; or registered language subtag 156 * 157 * extlang = 3ALPHA ; selected ISO 639 codes 158 * *2("-" 3ALPHA) ; permanently reserved 159 * 160 * script = 4ALPHA ; ISO 15924 code 161 * 162 * region = 2ALPHA ; ISO 3166-1 code 163 * / 3DIGIT ; UN M.49 code 164 * 165 * variant = 5*8alphanum ; registered variants 166 * / (DIGIT 3alphanum) 167 * 168 * extension = singleton 1*("-" (2*8alphanum)) 169 * 170 * ; Single alphanumerics 171 * ; "x" reserved for private use 172 * singleton = DIGIT ; 0 - 9 173 * / %x41-57 ; A - W 174 * / %x59-5A ; Y - Z 175 * / %x61-77 ; a - w 176 * / %x79-7A ; y - z 177 * 178 * privateuse = "x" 1*("-" (1*8alphanum)) 179 * 180 */ 181 public static LanguageTag parse(String languageTag, ParseStatus sts) { 182 if (sts == null) { 183 sts = new ParseStatus(); 184 } else { 185 sts.reset(); 186 } 187 188 StringTokenIterator itr; 189 190 // Check if the tag is grandfathered 191 String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); 192 if (gfmap != null) { 193 // use preferred mapping 194 itr = new StringTokenIterator(gfmap[1], SEP); 195 } else { 196 itr = new StringTokenIterator(languageTag, SEP); 197 } 198 199 LanguageTag tag = new LanguageTag(); 200 201 // langtag must start with either language or privateuse 202 if (tag.parseLanguage(itr, sts)) { 203 tag.parseExtlangs(itr, sts); 204 tag.parseScript(itr, sts); 205 tag.parseRegion(itr, sts); 206 tag.parseVariants(itr, sts); 207 tag.parseExtensions(itr, sts); 208 } 209 tag.parsePrivateuse(itr, sts); 210 211 if (!itr.isDone() && !sts.isError()) { 212 String s = itr.current(); 213 sts._errorIndex = itr.currentStart(); 214 if (s.length() == 0) { 215 sts._errorMsg = "Empty subtag"; 216 } else { 217 sts._errorMsg = "Invalid subtag: " + s; 218 } 219 } 220 221 return tag; 222 } 223 224 // 225 // Language subtag parsers 226 // 227 228 private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) { 229 if (itr.isDone() || sts.isError()) { 230 return false; 231 } 232 233 boolean found = false; 234 235 String s = itr.current(); 236 if (isLanguage(s)) { 237 found = true; 238 _language = s; 239 sts._parseLength = itr.currentEnd(); 240 itr.next(); 241 } 242 243 return found; 244 } 245 246 private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) { 247 if (itr.isDone() || sts.isError()) { 248 return false; 249 } 250 251 boolean found = false; 252 253 while (!itr.isDone()) { 254 String s = itr.current(); 255 if (!isExtlang(s)) { 256 break; 257 } 258 found = true; 259 if (_extlangs.isEmpty()) { 260 _extlangs = new ArrayList<String>(3); 261 } 262 _extlangs.add(s); 263 sts._parseLength = itr.currentEnd(); 264 itr.next(); 265 266 if (_extlangs.size() == 3) { 267 // Maximum 3 extlangs 268 break; 269 } 270 } 271 272 return found; 273 } 274 275 private boolean parseScript(StringTokenIterator itr, ParseStatus sts) { 276 if (itr.isDone() || sts.isError()) { 277 return false; 278 } 279 280 boolean found = false; 281 282 String s = itr.current(); 283 if (isScript(s)) { 284 found = true; 285 _script = s; 286 sts._parseLength = itr.currentEnd(); 287 itr.next(); 288 } 289 290 return found; 291 } 292 293 private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) { 294 if (itr.isDone() || sts.isError()) { 295 return false; 296 } 297 298 boolean found = false; 299 300 String s = itr.current(); 301 if (isRegion(s)) { 302 found = true; 303 _region = s; 304 sts._parseLength = itr.currentEnd(); 305 itr.next(); 306 } 307 308 return found; 309 } 310 311 private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) { 312 if (itr.isDone() || sts.isError()) { 313 return false; 314 } 315 316 boolean found = false; 317 318 while (!itr.isDone()) { 319 String s = itr.current(); 320 if (!isVariant(s)) { 321 break; 322 } 323 found = true; 324 if (_variants.isEmpty()) { 325 _variants = new ArrayList<String>(3); 326 } 327 _variants.add(s); 328 sts._parseLength = itr.currentEnd(); 329 itr.next(); 330 } 331 332 return found; 333 } 334 335 private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) { 336 if (itr.isDone() || sts.isError()) { 337 return false; 338 } 339 340 boolean found = false; 341 342 while (!itr.isDone()) { 343 String s = itr.current(); 344 if (isExtensionSingleton(s)) { 345 int start = itr.currentStart(); 346 String singleton = s; 347 StringBuilder sb = new StringBuilder(singleton); 348 349 itr.next(); 350 while (!itr.isDone()) { 351 s = itr.current(); 352 if (isExtensionSubtag(s)) { 353 sb.append(SEP).append(s); 354 sts._parseLength = itr.currentEnd(); 355 } else { 356 break; 357 } 358 itr.next(); 359 } 360 361 if (sts._parseLength <= start) { 362 sts._errorIndex = start; 363 sts._errorMsg = "Incomplete extension '" + singleton + "'"; 364 break; 365 } 366 367 if (_extensions.size() == 0) { 368 _extensions = new ArrayList<String>(4); 369 } 370 _extensions.add(sb.toString()); 371 found = true; 372 } else { 373 break; 374 } 375 } 376 return found; 377 } 378 379 private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) { 380 if (itr.isDone() || sts.isError()) { 381 return false; 382 } 383 384 boolean found = false; 385 386 String s = itr.current(); 387 if (isPrivateusePrefix(s)) { 388 int start = itr.currentStart(); 389 StringBuilder sb = new StringBuilder(s); 390 391 itr.next(); 392 while (!itr.isDone()) { 393 s = itr.current(); 394 if (!isPrivateuseSubtag(s)) { 395 break; 396 } 397 sb.append(SEP).append(s); 398 sts._parseLength = itr.currentEnd(); 399 400 itr.next(); 401 } 402 403 if (sts._parseLength <= start) { 404 // need at least 1 private subtag 405 sts._errorIndex = start; 406 sts._errorMsg = "Incomplete privateuse"; 407 } else { 408 _privateuse = sb.toString(); 409 found = true; 410 } 411 } 412 413 return found; 414 } 415 416 public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) { 417 LanguageTag tag = new LanguageTag(); 418 419 String language = baseLocale.getLanguage(); 420 String script = baseLocale.getScript(); 421 String region = baseLocale.getRegion(); 422 String variant = baseLocale.getVariant(); 423 424 boolean hasSubtag = false; 425 426 String privuseVar = null; // store ill-formed variant subtags 427 428 if (language.length() > 0 && isLanguage(language)) { 429 // Convert a deprecated language code used by Java to 430 // a new code 431 if (language.equals("iw")) { 432 language = "he"; 433 } else if (language.equals("ji")) { 434 language = "yi"; 435 } else if (language.equals("in")) { 436 language = "id"; 437 } 438 tag._language = language; 439 } 440 441 if (script.length() > 0 && isScript(script)) { 442 tag._script = canonicalizeScript(script); 443 hasSubtag = true; 444 } 445 446 if (region.length() > 0 && isRegion(region)) { 447 tag._region = canonicalizeRegion(region); 448 hasSubtag = true; 449 } 450 451 // Special handling for no_NO_NY - use nn_NO for language tag 452 if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) { 453 tag._language = "nn"; 454 variant = ""; 455 } 456 457 if (variant.length() > 0) { 458 List<String> variants = null; 459 StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP); 460 while (!varitr.isDone()) { 461 String var = varitr.current(); 462 if (!isVariant(var)) { 463 break; 464 } 465 if (variants == null) { 466 variants = new ArrayList<String>(); 467 } 468 variants.add(var); // Do not canonicalize! 469 varitr.next(); 470 } 471 if (variants != null) { 472 tag._variants = variants; 473 hasSubtag = true; 474 } 475 if (!varitr.isDone()) { 476 // ill-formed variant subtags 477 StringBuilder buf = new StringBuilder(); 478 while (!varitr.isDone()) { 479 String prvv = varitr.current(); 480 if (!isPrivateuseSubtag(prvv)) { 481 // cannot use private use subtag - truncated 482 break; 483 } 484 if (buf.length() > 0) { 485 buf.append(SEP); 486 } 487 buf.append(prvv); 488 varitr.next(); 489 } 490 if (buf.length() > 0) { 491 privuseVar = buf.toString(); 492 } 493 } 494 } 495 496 List<String> extensions = null; 497 String privateuse = null; 498 499 Set<Character> locextKeys = localeExtensions.getKeys(); 500 for (Character locextKey : locextKeys) { 501 Extension ext = localeExtensions.getExtension(locextKey); 502 if (isPrivateusePrefixChar(locextKey.charValue())) { 503 privateuse = ext.getValue(); 504 } else { 505 if (extensions == null) { 506 extensions = new ArrayList<String>(); 507 } 508 extensions.add(locextKey.toString() + SEP + ext.getValue()); 509 } 510 } 511 512 if (extensions != null) { 513 tag._extensions = extensions; 514 hasSubtag = true; 515 } 516 517 // append ill-formed variant subtags to private use 518 if (privuseVar != null) { 519 if (privateuse == null) { 520 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar; 521 } else { 522 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP); 523 } 524 } 525 526 if (privateuse != null) { 527 tag._privateuse = privateuse; 528 } 529 530 if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) { 531 // use lang "und" when 1) no language is available AND 532 // 2) any of other subtags other than private use are available or 533 // no private use tag is available 534 tag._language = UNDETERMINED; 535 } 536 537 return tag; 538 } 539 540 // 541 // Getter methods for language subtag fields 542 // 543 544 public String getLanguage() { 545 return _language; 546 } 547 548 public List<String> getExtlangs() { 549 return Collections.unmodifiableList(_extlangs); 550 } 551 552 public String getScript() { 553 return _script; 554 } 555 556 public String getRegion() { 557 return _region; 558 } 559 560 public List<String> getVariants() { 561 return Collections.unmodifiableList(_variants); 562 } 563 564 public List<String> getExtensions() { 565 return Collections.unmodifiableList(_extensions); 566 } 567 568 public String getPrivateuse() { 569 return _privateuse; 570 } 571 572 // 573 // Language subtag syntax checking methods 574 // 575 576 public static boolean isLanguage(String s) { 577 // language = 2*3ALPHA ; shortest ISO 639 code 578 // ["-" extlang] ; sometimes followed by 579 // ; extended language subtags 580 // / 4ALPHA ; or reserved for future use 581 // / 5*8ALPHA ; or registered language subtag 582 return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s); 583 } 584 585 public static boolean isExtlang(String s) { 586 // extlang = 3ALPHA ; selected ISO 639 codes 587 // *2("-" 3ALPHA) ; permanently reserved 588 return (s.length() == 3) && AsciiUtil.isAlphaString(s); 589 } 590 591 public static boolean isScript(String s) { 592 // script = 4ALPHA ; ISO 15924 code 593 return (s.length() == 4) && AsciiUtil.isAlphaString(s); 594 } 595 596 public static boolean isRegion(String s) { 597 // region = 2ALPHA ; ISO 3166-1 code 598 // / 3DIGIT ; UN M.49 code 599 return ((s.length() == 2) && AsciiUtil.isAlphaString(s)) 600 || ((s.length() == 3) && AsciiUtil.isNumericString(s)); 601 } 602 603 public static boolean isVariant(String s) { 604 // variant = 5*8alphanum ; registered variants 605 // / (DIGIT 3alphanum) 606 int len = s.length(); 607 if (len >= 5 && len <= 8) { 608 return AsciiUtil.isAlphaNumericString(s); 609 } 610 if (len == 4) { 611 return AsciiUtil.isNumeric(s.charAt(0)) 612 && AsciiUtil.isAlphaNumeric(s.charAt(1)) 613 && AsciiUtil.isAlphaNumeric(s.charAt(2)) 614 && AsciiUtil.isAlphaNumeric(s.charAt(3)); 615 } 616 return false; 617 } 618 619 public static boolean isExtensionSingleton(String s) { 620 // singleton = DIGIT ; 0 - 9 621 // / %x41-57 ; A - W 622 // / %x59-5A ; Y - Z 623 // / %x61-77 ; a - w 624 // / %x79-7A ; y - z 625 626 return (s.length() == 1) 627 && AsciiUtil.isAlphaString(s) 628 && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s); 629 } 630 631 public static boolean isExtensionSingletonChar(char c) { 632 return isExtensionSingleton(String.valueOf(c)); 633 } 634 635 public static boolean isExtensionSubtag(String s) { 636 // extension = singleton 1*("-" (2*8alphanum)) 637 return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s); 638 } 639 640 public static boolean isPrivateusePrefix(String s) { 641 // privateuse = "x" 1*("-" (1*8alphanum)) 642 return (s.length() == 1) 643 && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s); 644 } 645 646 public static boolean isPrivateusePrefixChar(char c) { 647 return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c))); 648 } 649 650 public static boolean isPrivateuseSubtag(String s) { 651 // privateuse = "x" 1*("-" (1*8alphanum)) 652 return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s); 653 } 654 655 // 656 // Language subtag canonicalization methods 657 // 658 659 public static String canonicalizeLanguage(String s) { 660 return AsciiUtil.toLowerString(s); 661 } 662 663 public static String canonicalizeExtlang(String s) { 664 return AsciiUtil.toLowerString(s); 665 } 666 667 public static String canonicalizeScript(String s) { 668 return AsciiUtil.toTitleString(s); 669 } 670 671 public static String canonicalizeRegion(String s) { 672 return AsciiUtil.toUpperString(s); 673 } 674 675 public static String canonicalizeVariant(String s) { 676 return AsciiUtil.toLowerString(s); 677 } 678 679 public static String canonicalizeExtension(String s) { 680 return AsciiUtil.toLowerString(s); 681 } 682 683 public static String canonicalizeExtensionSingleton(String s) { 684 return AsciiUtil.toLowerString(s); 685 } 686 687 public static String canonicalizeExtensionSubtag(String s) { 688 return AsciiUtil.toLowerString(s); 689 } 690 691 public static String canonicalizePrivateuse(String s) { 692 return AsciiUtil.toLowerString(s); 693 } 694 695 public static String canonicalizePrivateuseSubtag(String s) { 696 return AsciiUtil.toLowerString(s); 697 } 698 699 public String toString() { 700 StringBuilder sb = new StringBuilder(); 701 702 if (_language.length() > 0) { 703 sb.append(_language); 704 705 for (String extlang : _extlangs) { 706 sb.append(SEP).append(extlang); 707 } 708 709 if (_script.length() > 0) { 710 sb.append(SEP).append(_script); 711 } 712 713 if (_region.length() > 0) { 714 sb.append(SEP).append(_region); 715 } 716 717 for (String variant : _extlangs) { 718 sb.append(SEP).append(variant); 719 } 720 721 for (String extension : _extensions) { 722 sb.append(SEP).append(extension); 723 } 724 } 725 if (_privateuse.length() > 0) { 726 if (sb.length() > 0) { 727 sb.append(SEP); 728 } 729 sb.append(_privateuse); 730 } 731 732 return sb.toString(); 733 } 734 }