1 /*
   2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   3  * this software and associated documentation files (the "Software"), to deal in
   4  * the Software without restriction, including without limitation the rights to
   5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   6  * of the Software, and to permit persons to whom the Software is furnished to do
   7  * so, subject to the following conditions:
   8  *
   9  * The above copyright notice and this permission notice shall be included in all
  10  * copies or substantial portions of the Software.
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18  * SOFTWARE.
  19  */
  20 package jdk.nashorn.internal.joni;
  21 
  22 import jdk.nashorn.internal.joni.encoding.CharacterType;
  23 import jdk.nashorn.internal.joni.encoding.IntHolder;
  24 
  25 import java.util.Arrays;
  26 
  27 public class EncodingHelper {
  28 
  29     public final static char NEW_LINE = 0xa;
  30     public final static char RETURN   = 0xd;
  31 
  32     final static char[] EMPTYCHARS = new char[0];
  33     final static int[][] codeRanges = new int[15][];
  34 
  35     public static int digitVal(int code) {
  36         return code - '0';
  37     }
  38 
  39     public static int odigitVal(int code) {
  40         return digitVal(code);
  41     }
  42 
  43     public static boolean isXDigit(int code) {
  44         return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
  45     }
  46 
  47     public static int xdigitVal(int code) {
  48         if (Character.isDigit(code)) {
  49             return code - '0';
  50         } else if (code >= 'a' && code <= 'f') {
  51             return code - 'a' + 10;
  52         } else {
  53             return code - 'A' + 10;
  54         }
  55     }
  56 
  57     public static boolean isDigit(int code) {
  58         return code >= '0' && code <= '9';
  59     }
  60 
  61     public static boolean isWord(int code) {
  62         // letter, digit, or '_'
  63         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
  64     }
  65 
  66     public static boolean isNewLine(int code) {
  67         return code == NEW_LINE;
  68     }
  69 
  70     public static boolean isNewLine(char[] chars, int p, int end) {
  71         return p < end && chars[p] == NEW_LINE;
  72     }
  73 
  74     public static boolean isCrnl(char[] chars, int p, int end) {
  75         return p + 1 < end && chars[p] == RETURN && chars[p + 1] == NEW_LINE;
  76     }
  77 
  78     // Encoding.prevCharHead
  79     public static int prevCharHead(int p, int s) {
  80         return s <= p ? -1 : s - 1;
  81     }
  82 
  83     /* onigenc_get_right_adjust_char_head_with_prev */
  84     public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
  85         if (prev != null) prev.value = -1; /* Sorry */
  86         return s;
  87     }
  88 
  89     // Encoding.stepBack
  90     public static int stepBack(int p, int s, int n) {
  91        while (s != -1 && n-- > 0) {
  92            if (s <= p) return -1;
  93            s--;
  94        }
  95        return s;
  96     }
  97 
  98     /* onigenc_with_ascii_strncmp */
  99     public static int strNCmp(char[] chars1, int p1, int end, char[] chars2, int p2, int n) {
 100         while (n-- > 0) {
 101             if (p1 >= end) return chars2[p2];
 102             int c = chars1[p1];
 103             int x = chars2[p2] - c;
 104             if (x != 0) return x;
 105 
 106             p2++;
 107             p1++;
 108         }
 109         return 0;
 110     }
 111 
 112     public static int mbcToCode(byte[] bytes, int p, int end) {
 113         int code = 0;
 114         for (int i = p; i < end; i++) {
 115             code = (code << 8) | (bytes[i] & 0xff);
 116         }
 117         return code;
 118     }
 119 
 120     public static int mbcodeStartPosition() {
 121         return 0x80;
 122     }
 123 
 124     public static char[] caseFoldCodesByString(int flag, char c) {
 125         if (Character.isUpperCase(c)) {
 126             return new char[] {Character.toLowerCase(c)};
 127         } else if (Character.isLowerCase(c)) {
 128             return new char[] {Character.toUpperCase(c)};
 129         } else {
 130             return EMPTYCHARS;
 131         }
 132     }
 133 
 134     public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
 135         int[] code = new int[1];
 136 
 137         for (int c = 0; c < 0xffff; c++) {
 138             if (Character.getType(c) == Character.LOWERCASE_LETTER) {
 139 
 140                 int upper = code[0] = Character.toUpperCase(c);
 141                 fun.apply(c, code, 1, arg);
 142 
 143                 code[0] = c;
 144                 fun.apply(upper, code, 1, arg);
 145             }
 146         }
 147     }
 148 
 149     // CodeRange.isInCodeRange
 150     public static boolean isInCodeRange(int[]p, int code) {
 151         int low = 0;
 152         int n = p[0];
 153         int high = n;
 154 
 155         while (low < high) {
 156             int x = (low + high) >> 1;
 157             if (code > p[(x << 1) + 2]) {
 158                 low = x + 1;
 159             } else {
 160                 high = x;
 161             }
 162         }
 163         return low < n && code >= p[(low << 1) + 1];
 164     }
 165 
 166     public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
 167         sbOut.value = 0x100; // use bitset for codes smaller than 256
 168         int[] range = null;
 169 
 170         if (ctype < codeRanges.length) {
 171             range = codeRanges[ctype];
 172 
 173             if (range == null) {
 174                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
 175                 range = new int[16];
 176                 int rangeCount = 0;
 177                 int lastCode = -2;
 178 
 179                 for (int code = 0; code <= 0xffff; code++) {
 180                     if (isCodeCType(code, ctype)) {
 181                         if (lastCode < code -1) {
 182                             if (rangeCount * 2 + 2 >= range.length) {
 183                                 range = Arrays.copyOf(range, range.length * 2);
 184                             }
 185                             range[rangeCount * 2 + 1] = code;
 186                             rangeCount++;
 187                         }
 188                         range[rangeCount * 2] = lastCode = code;
 189                     }
 190                 }
 191 
 192                 if (rangeCount * 2 + 1 < range.length) {
 193                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
 194                 }
 195 
 196                 range[0] = rangeCount;
 197                 codeRanges[ctype] = range;
 198             }
 199         }
 200 
 201         return range;
 202     }
 203 
 204     // CodeRange.isInCodeRange
 205     public static boolean isInCodeRange(int[]p, int offset, int code) {
 206         int low = 0;
 207         int n = p[offset];
 208         int high = n ;
 209 
 210         while (low < high) {
 211             int x = (low + high) >> 1;
 212             if (code > p[(x << 1) + 2 + offset]) {
 213                 low = x + 1;
 214             } else {
 215                 high = x;
 216             }
 217         }
 218         return low < n && code >= p[(low << 1) + 1 + offset];
 219     }
 220 
 221     /**
 222      * @see [http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt]
 223      */
 224     public static boolean isCodeCType(int code, int ctype) {
 225         int type;
 226         switch (ctype) {
 227             case CharacterType.NEWLINE:
 228                 return code == EncodingHelper.NEW_LINE;
 229             case CharacterType.ALPHA:
 230                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
 231             case CharacterType.BLANK:
 232                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
 233             case CharacterType.CNTRL:
 234                 type = Character.getType(code);
 235                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
 236             case CharacterType.DIGIT:
 237                 return EncodingHelper.isDigit(code);
 238             case CharacterType.GRAPH:
 239                 switch (code) {
 240                     case 0x09:
 241                     case 0x0a:
 242                     case 0x0b:
 243                     case 0x0c:
 244                     case 0x0d:
 245                         return false;
 246                     default:
 247                         type = Character.getType(code);
 248                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
 249                 }
 250             case CharacterType.LOWER:
 251                 return Character.isLowerCase(code);
 252             case CharacterType.PRINT:
 253                 type = Character.getType(code);
 254                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
 255             case CharacterType.PUNCT:
 256                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
 257             case CharacterType.SPACE:
 258                 // ECMA 7.2 and 7.3
 259                 switch (code) {
 260                     case 0x09:
 261                     case 0x0a:
 262                     case 0x0b:
 263                     case 0x0c:
 264                     case 0x0d:
 265                         return true;
 266                     default:
 267                         // true if Unicode separator or BOM
 268                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
 269                 }
 270             case CharacterType.UPPER:
 271                 return Character.isUpperCase(code);
 272             case CharacterType.XDIGIT:
 273                 return EncodingHelper.isXDigit(code);
 274             case CharacterType.WORD:
 275                 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
 276             case CharacterType.ALNUM:
 277                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
 278             case CharacterType.ASCII:
 279                 return code < 0x80;
 280             default:
 281                 throw new RuntimeException("illegal character type: " + ctype);
 282         }
 283     }
 284 }
 285