1 /*
   2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   3  * this software and associated documentation files (the "Software"), to deal in
   4  * the Software without restriction, including without limitation the rights to
   5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   6  * of the Software, and to permit persons to whom the Software is furnished to do
   7  * so, subject to the following conditions:
   8  *
   9  * The above copyright notice and this permission notice shall be included in all
  10  * copies or substantial portions of the Software.
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18  * SOFTWARE.
  19  */
  20 package jdk.nashorn.internal.runtime.regexp.joni;
  21 
  22 import java.util.Arrays;
  23 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
  24 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
  25 
  26 @SuppressWarnings("javadoc")
  27 public final class EncodingHelper {
  28 
  29     final static int NEW_LINE            = 0x000a;
  30     final static int RETURN              = 0x000d;
  31     final static int LINE_SEPARATOR      = 0x2028;
  32     final static int PARAGRAPH_SEPARATOR = 0x2029;
  33 
  34     final static char[] EMPTYCHARS = new char[0];
  35     final static int[][] codeRanges = new int[15][];
  36 
  37     public static int digitVal(final int code) {
  38         return code - '0';
  39     }
  40 
  41     public static int odigitVal(final int code) {
  42         return digitVal(code);
  43     }
  44 
  45     public static boolean isXDigit(final int code) {
  46         return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
  47     }
  48 
  49     public static int xdigitVal(final int code) {
  50         if (Character.isDigit(code)) {
  51             return code - '0';
  52         } else if (code >= 'a' && code <= 'f') {
  53             return code - 'a' + 10;
  54         } else {
  55             return code - 'A' + 10;
  56         }
  57     }
  58 
  59     public static boolean isDigit(final int code) {
  60         return code >= '0' && code <= '9';
  61     }
  62 
  63     public static boolean isWord(final int code) {
  64         // letter, digit, or '_'
  65         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
  66     }
  67 
  68     public static boolean isNewLine(final int code) {
  69         return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
  70     }
  71 
  72     public static boolean isNewLine(final char[] chars, final int p, final int end) {
  73         return p < end && isNewLine(chars[p]);
  74     }
  75 
  76     // Encoding.prevCharHead
  77     public static int prevCharHead(final int p, final int s) {
  78         return s <= p ? -1 : s - 1;
  79     }
  80 
  81     /* onigenc_get_right_adjust_char_head_with_prev */
  82     public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
  83         if (prev != null) {
  84             prev.value = -1; /* Sorry */
  85         }
  86         return s;
  87     }
  88 
  89     // Encoding.stepBack
  90     public static int stepBack(final int p, final int sp, final int np) {
  91         int s = sp, n = np;
  92         while (s != -1 && n-- > 0) {
  93            if (s <= p) {
  94             return -1;
  95         }
  96            s--;
  97        }
  98        return s;
  99     }
 100 
 101     public static int mbcodeStartPosition() {
 102         return 0x80;
 103     }
 104 
 105     public static char[] caseFoldCodesByString(final int flag, final char c) {
 106         char[] codes = EMPTYCHARS;
 107         final char upper = toUpperCase(c);
 108 
 109         if (upper != toLowerCase(upper)) {
 110             int count = 0;
 111             char ch = 0;
 112 
 113             do {
 114                 final char u = toUpperCase(ch);
 115                 if (u == upper && ch != c) {
 116                     // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
 117                     codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
 118                     codes[count++] = ch;
 119                 }
 120             } while (ch++ < 0xffff);
 121         }
 122         return codes;
 123     }
 124 
 125     public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
 126         for (int c = 0; c < 0xffff; c++) {
 127             if (Character.isLowerCase(c)) {
 128                 final int upper = toUpperCase(c);
 129 
 130                 if (upper != c) {
 131                     ApplyCaseFold.apply(c, upper, arg);
 132                 }
 133             }
 134         }
 135 
 136         // Some characters have multiple lower case variants, hence we need to do a second run
 137         for (int c = 0; c < 0xffff; c++) {
 138             if (Character.isLowerCase(c)) {
 139                 final int upper = toUpperCase(c);
 140 
 141                 if (upper != c) {
 142                     ApplyCaseFold.apply(upper, c, arg);
 143                 }
 144             }
 145         }
 146     }
 147 
 148     public static char toLowerCase(final char c) {
 149         return (char)toLowerCase((int)c);
 150     }
 151 
 152     public static int toLowerCase(final int c) {
 153         if (c < 128) {
 154             return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
 155         }
 156         // Do not convert non-ASCII upper case character to ASCII lower case.
 157         final int lower = Character.toLowerCase(c);
 158         return (lower < 128) ? c : lower;
 159 
 160     }
 161 
 162     public static char toUpperCase(final char c) {
 163         return (char)toUpperCase((int)c);
 164     }
 165 
 166     public static int toUpperCase(final int c) {
 167         if (c < 128) {
 168             return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
 169         }
 170         // Do not convert non-ASCII lower case character to ASCII upper case.
 171         final int upper = Character.toUpperCase(c);
 172         return (upper < 128) ? c : upper;
 173     }
 174 
 175     public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
 176         sbOut.value = 0x100; // use bitset for codes smaller than 256
 177         int[] range = null;
 178 
 179         if (ctype < codeRanges.length) {
 180             range = codeRanges[ctype];
 181 
 182             if (range == null) {
 183                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
 184                 range = new int[16];
 185                 int rangeCount = 0;
 186                 int lastCode = -2;
 187 
 188                 for (int code = 0; code <= 0xffff; code++) {
 189                     if (isCodeCType(code, ctype)) {
 190                         if (lastCode < code -1) {
 191                             if (rangeCount * 2 + 2 >= range.length) {
 192                                 range = Arrays.copyOf(range, range.length * 2);
 193                             }
 194                             range[rangeCount * 2 + 1] = code;
 195                             rangeCount++;
 196                         }
 197                         range[rangeCount * 2] = lastCode = code;
 198                     }
 199                 }
 200 
 201                 if (rangeCount * 2 + 1 < range.length) {
 202                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
 203                 }
 204 
 205                 range[0] = rangeCount;
 206                 codeRanges[ctype] = range;
 207             }
 208         }
 209 
 210         return range;
 211     }
 212 
 213     // CodeRange.isInCodeRange
 214     public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
 215         int low = 0;
 216         final int n = p[offset];
 217         int high = n ;
 218 
 219         while (low < high) {
 220             final int x = (low + high) >> 1;
 221             if (code > p[(x << 1) + 2 + offset]) {
 222                 low = x + 1;
 223             } else {
 224                 high = x;
 225             }
 226         }
 227         return low < n && code >= p[(low << 1) + 1 + offset];
 228     }
 229 
 230     /**
 231      * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
 232      *
 233      * @param code code
 234      * @param ctype ctype
 235      *
 236      * @return isCodeCType
 237      */
 238     public static boolean isCodeCType(final int code, final int ctype) {
 239         int type;
 240         switch (ctype) {
 241             case CharacterType.NEWLINE:
 242                 return isNewLine(code);
 243             case CharacterType.ALPHA:
 244                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
 245             case CharacterType.BLANK:
 246                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
 247             case CharacterType.CNTRL:
 248                 type = Character.getType(code);
 249                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
 250             case CharacterType.DIGIT:
 251                 return EncodingHelper.isDigit(code);
 252             case CharacterType.GRAPH:
 253                 switch (code) {
 254                     case 0x09:
 255                     case 0x0a:
 256                     case 0x0b:
 257                     case 0x0c:
 258                     case 0x0d:
 259                         return false;
 260                     default:
 261                         type = Character.getType(code);
 262                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
 263                 }
 264             case CharacterType.LOWER:
 265                 return Character.isLowerCase(code);
 266             case CharacterType.PRINT:
 267                 type = Character.getType(code);
 268                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
 269             case CharacterType.PUNCT:
 270                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
 271             case CharacterType.SPACE:
 272                 // ECMA 7.2 and 7.3
 273                 switch (code) {
 274                     case 0x09:
 275                     case 0x0a:
 276                     case 0x0b:
 277                     case 0x0c:
 278                     case 0x0d:
 279                         return true;
 280                     default:
 281                         // true if Unicode separator or BOM or U+180E (see JDK-8138758)
 282                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0
 283                                 || code == 0xfeff || code == 0x180e;
 284                 }
 285             case CharacterType.UPPER:
 286                 return Character.isUpperCase(code);
 287             case CharacterType.XDIGIT:
 288                 return EncodingHelper.isXDigit(code);
 289             case CharacterType.WORD:
 290                 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
 291             case CharacterType.ALNUM:
 292                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
 293             case CharacterType.ASCII:
 294                 return code < 0x80;
 295             default:
 296                 throw new RuntimeException("illegal character type: " + ctype);
 297         }
 298     }
 299 }
 300