1 /*
   2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   3  * this software and associated documentation files (the "Software"), to deal in
   4  * the Software without restriction, including without limitation the rights to
   5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   6  * of the Software, and to permit persons to whom the Software is furnished to do
   7  * so, subject to the following conditions:
   8  *
   9  * The above copyright notice and this permission notice shall be included in all
  10  * copies or substantial portions of the Software.
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18  * SOFTWARE.
  19  */
  20 package jdk.nashorn.internal.runtime.regexp.joni;
  21 
  22 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
  23 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
  24 
  25 import java.util.Arrays;
  26 
  27 public final class EncodingHelper {
  28 
  29     final static int NEW_LINE            = 0x000a;
  30     final static int RETURN              = 0x000d;
  31     final static int LINE_SEPARATOR      = 0x2028;
  32     final static int PARAGRAPH_SEPARATOR = 0x2029;
  33 
  34     final static char[] EMPTYCHARS = new char[0];
  35     final static int[][] codeRanges = new int[15][];
  36 
  37     public static int digitVal(int code) {
  38         return code - '0';
  39     }
  40 
  41     public static int odigitVal(int code) {
  42         return digitVal(code);
  43     }
  44 
  45     public static boolean isXDigit(int code) {
  46         return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
  47     }
  48 
  49     public static int xdigitVal(int code) {
  50         if (Character.isDigit(code)) {
  51             return code - '0';
  52         } else if (code >= 'a' && code <= 'f') {
  53             return code - 'a' + 10;
  54         } else {
  55             return code - 'A' + 10;
  56         }
  57     }
  58 
  59     public static boolean isDigit(int code) {
  60         return code >= '0' && code <= '9';
  61     }
  62 
  63     public static boolean isWord(int code) {
  64         // letter, digit, or '_'
  65         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
  66     }
  67 
  68     public static boolean isNewLine(int code) {
  69         return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
  70     }
  71 
  72     public static boolean isNewLine(char[] chars, int p, int end) {
  73         return p < end && isNewLine(chars[p]);
  74     }
  75 
  76     // Encoding.prevCharHead
  77     public static int prevCharHead(int p, int s) {
  78         return s <= p ? -1 : s - 1;
  79     }
  80 
  81     /* onigenc_get_right_adjust_char_head_with_prev */
  82     public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
  83         if (prev != null) prev.value = -1; /* Sorry */
  84         return s;
  85     }
  86 
  87     // Encoding.stepBack
  88     public static int stepBack(int p, int s, int n) {
  89        while (s != -1 && n-- > 0) {
  90            if (s <= p) return -1;
  91            s--;
  92        }
  93        return s;
  94     }
  95 
  96     public static int mbcodeStartPosition() {
  97         return 0x80;
  98     }
  99 
 100     public static char[] caseFoldCodesByString(int flag, char c) {
 101         char[] codes = EMPTYCHARS;
 102         final char upper = toUpperCase(c);
 103 
 104         if (upper != toLowerCase(upper)) {
 105             int count = 0;
 106             char ch = 0;
 107 
 108             do {
 109                 final char u = toUpperCase(ch);
 110                 if (u == upper && ch != c) {
 111                     // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
 112                     codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
 113                     codes[count++] = ch;
 114                 }
 115             } while (ch++ < 0xffff);
 116         }
 117         return codes;
 118     }
 119 
 120     public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
 121         for (int c = 0; c < 0xffff; c++) {
 122             if (Character.isLowerCase(c)) {
 123                 final int upper = toUpperCase(c);
 124 
 125                 if (upper != c) {
 126                     fun.apply(c, upper, arg);
 127                 }
 128             }
 129         }
 130 
 131         // Some characters have multiple lower case variants, hence we need to do a second run
 132         for (int c = 0; c < 0xffff; c++) {
 133             if (Character.isLowerCase(c)) {
 134                 final int upper = toUpperCase(c);
 135 
 136                 if (upper != c) {
 137                     fun.apply(upper, c, arg);
 138                 }
 139             }
 140         }
 141     }
 142 
 143     public static char toLowerCase(char c) {
 144         return (char)toLowerCase((int)c);
 145     }
 146 
 147     public static int toLowerCase(int c) {
 148         if (c < 128) {
 149             return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
 150         }
 151         // Do not convert non-ASCII upper case character to ASCII lower case.
 152         int lower = Character.toLowerCase(c);
 153         return (lower < 128) ? c : lower;
 154 
 155     }
 156 
 157     public static char toUpperCase(char c) {
 158         return (char)toUpperCase((int)c);
 159     }
 160 
 161     public static int toUpperCase(int c) {
 162         if (c < 128) {
 163             return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
 164         }
 165         // Do not convert non-ASCII lower case character to ASCII upper case.
 166         int upper = Character.toUpperCase(c);
 167         return (upper < 128) ? c : upper;
 168     }
 169 
 170     public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
 171         sbOut.value = 0x100; // use bitset for codes smaller than 256
 172         int[] range = null;
 173 
 174         if (ctype < codeRanges.length) {
 175             range = codeRanges[ctype];
 176 
 177             if (range == null) {
 178                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
 179                 range = new int[16];
 180                 int rangeCount = 0;
 181                 int lastCode = -2;
 182 
 183                 for (int code = 0; code <= 0xffff; code++) {
 184                     if (isCodeCType(code, ctype)) {
 185                         if (lastCode < code -1) {
 186                             if (rangeCount * 2 + 2 >= range.length) {
 187                                 range = Arrays.copyOf(range, range.length * 2);
 188                             }
 189                             range[rangeCount * 2 + 1] = code;
 190                             rangeCount++;
 191                         }
 192                         range[rangeCount * 2] = lastCode = code;
 193                     }
 194                 }
 195 
 196                 if (rangeCount * 2 + 1 < range.length) {
 197                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
 198                 }
 199 
 200                 range[0] = rangeCount;
 201                 codeRanges[ctype] = range;
 202             }
 203         }
 204 
 205         return range;
 206     }
 207 
 208     // CodeRange.isInCodeRange
 209     public static boolean isInCodeRange(int[] p, int offset, int code) {
 210         int low = 0;
 211         int n = p[offset];
 212         int high = n ;
 213 
 214         while (low < high) {
 215             int x = (low + high) >> 1;
 216             if (code > p[(x << 1) + 2 + offset]) {
 217                 low = x + 1;
 218             } else {
 219                 high = x;
 220             }
 221         }
 222         return low < n && code >= p[(low << 1) + 1 + offset];
 223     }
 224 
 225     /**
 226      * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
 227      */
 228     public static boolean isCodeCType(int code, int ctype) {
 229         int type;
 230         switch (ctype) {
 231             case CharacterType.NEWLINE:
 232                 return isNewLine(code);
 233             case CharacterType.ALPHA:
 234                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
 235             case CharacterType.BLANK:
 236                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
 237             case CharacterType.CNTRL:
 238                 type = Character.getType(code);
 239                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
 240             case CharacterType.DIGIT:
 241                 return EncodingHelper.isDigit(code);
 242             case CharacterType.GRAPH:
 243                 switch (code) {
 244                     case 0x09:
 245                     case 0x0a:
 246                     case 0x0b:
 247                     case 0x0c:
 248                     case 0x0d:
 249                         return false;
 250                     default:
 251                         type = Character.getType(code);
 252                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
 253                 }
 254             case CharacterType.LOWER:
 255                 return Character.isLowerCase(code);
 256             case CharacterType.PRINT:
 257                 type = Character.getType(code);
 258                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
 259             case CharacterType.PUNCT:
 260                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
 261             case CharacterType.SPACE:
 262                 // ECMA 7.2 and 7.3
 263                 switch (code) {
 264                     case 0x09:
 265                     case 0x0a:
 266                     case 0x0b:
 267                     case 0x0c:
 268                     case 0x0d:
 269                         return true;
 270                     default:
 271                         // true if Unicode separator or BOM
 272                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
 273                 }
 274             case CharacterType.UPPER:
 275                 return Character.isUpperCase(code);
 276             case CharacterType.XDIGIT:
 277                 return EncodingHelper.isXDigit(code);
 278             case CharacterType.WORD:
 279                 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
 280             case CharacterType.ALNUM:
 281                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
 282             case CharacterType.ASCII:
 283                 return code < 0x80;
 284             default:
 285                 throw new RuntimeException("illegal character type: " + ctype);
 286         }
 287     }
 288 }
 289