/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package jdk.nashorn.internal.runtime.regexp.joni;

import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;

import java.util.Arrays;

/**
 * Static helpers for character classification, position arithmetic and code
 * range lookup on {@code char[]} input.
 *
 * <p>Code range arrays handled by this class use the layout
 * {@code [numberOfRanges, rangeStart, rangeEnd, ...]}.
 */
public class EncodingHelper {

    /** Line feed (U+000A). */
    public final static char NEW_LINE = 0xa;
    /** Carriage return (U+000D). */
    public final static char RETURN = 0xd;

    /** Shared empty result returned by {@link #caseFoldCodesByString(int, char)}. */
    final static char[] EMPTYCHARS = new char[0];
    /**
     * Lazily filled cache of code range arrays, indexed by character type.
     * It is filled by {@link #ctypeCodeRange(int, IntHolder)} without any
     * synchronization.
     */
    final static int[][] codeRanges = new int[15][];

    /**
     * Returns the numeric value of a decimal digit.
     *
     * @param code the code point; it is not validated, so the result is simply
     *             {@code code - '0'}
     * @return {@code code - '0'}
     */
    public static int digitVal(int code) {
        return code - '0';
    }

    /**
     * Returns the numeric value of an octal digit. Delegates to
     * {@link #digitVal(int)}; the argument is not validated.
     *
     * @param code the code point
     * @return {@code code - '0'}
     */
    public static int odigitVal(int code) {
        return digitVal(code);
    }

    /**
     * Tests whether the code point is an ASCII hexadecimal digit
     * ({@code 0-9}, {@code a-f} or {@code A-F}).
     *
     * <p>Only ASCII digits are accepted. {@link Character#isDigit(int)} would
     * also accept every other Unicode decimal digit, which is not a hex digit
     * and for which {@link #xdigitVal(int)} cannot produce a value in 0..15.
     *
     * @param code the code point to test
     * @return {@code true} if {@code code} is an ASCII hexadecimal digit
     */
    public static boolean isXDigit(int code) {
        return isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
    }

    /**
     * Returns the numeric value of a hexadecimal digit.
     *
     * @param code the code point; callers are expected to have checked it with
     *             {@link #isXDigit(int)}. Any input that is neither an ASCII
     *             digit nor in {@code a-f} is treated as an upper case letter.
     * @return a value in 0..15 for a valid hexadecimal digit
     */
    public static int xdigitVal(int code) {
        // ASCII-only test: a non-ASCII decimal digit must not take the "code - '0'" path.
        if (isDigit(code)) {
            return code - '0';
        }
        if (code >= 'a' && code <= 'f') {
            return code - 'a' + 10;
        }
        return code - 'A' + 10;
    }

    /**
     * Tests whether the code point is an ASCII decimal digit.
     *
     * @param code the code point to test
     * @return {@code true} if {@code code} is in {@code '0'..'9'}
     */
    public static boolean isDigit(int code) {
        return code >= '0' && code <= '9';
    }

    /**
     * Tests whether the code point's general category is part of
     * {@link CharacterType#WORD_MASK}.
     *
     * @param code the code point to test
     * @return {@code true} if the category bit is set in the word mask
     */
    public static boolean isWord(int code) {
        // letter, digit, or '_'
        return hasCategoryIn(code, CharacterType.WORD_MASK);
    }

    /**
     * Tests whether the code point is {@link #NEW_LINE}.
     *
     * @param code the code point to test
     * @return {@code true} if {@code code} is a line feed
     */
    public static boolean isNewLine(int code) {
        return code == NEW_LINE;
    }

    /**
     * Tests whether there is a line feed at position {@code p}.
     *
     * @param chars the input
     * @param p     the position to inspect
     * @param end   exclusive end of the valid input
     * @return {@code true} if {@code p < end} and {@code chars[p]} is a line feed
     */
    public static boolean isNewLine(char[] chars, int p, int end) {
        return p < end && chars[p] == NEW_LINE;
    }

    /**
     * Tests whether a carriage return / line feed pair starts at position {@code p}.
     *
     * @param chars the input
     * @param p     the position to inspect
     * @param end   exclusive end of the valid input
     * @return {@code true} if both characters lie before {@code end} and form CR LF
     */
    public static boolean isCrnl(char[] chars, int p, int end) {
        return p + 1 < end && chars[p] == RETURN && chars[p + 1] == NEW_LINE;
    }

    // Encoding.prevCharHead
    /**
     * Returns the position preceding {@code s}.
     *
     * @param p the lower bound
     * @param s the current position
     * @return {@code s - 1}, or {@code -1} if {@code s <= p}
     */
    public static int prevCharHead(int p, int s) {
        return s <= p ? -1 : s - 1;
    }

    /* onigenc_get_right_adjust_char_head_with_prev */
    /**
     * Returns {@code s} unchanged and, if a holder is supplied, sets its value
     * to {@code -1}.
     *
     * @param s    the current position
     * @param prev holder receiving {@code -1}; may be {@code null}
     * @return {@code s}
     */
    public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
        if (prev != null) {
            prev.value = -1; /* Sorry */
        }
        return s;
    }

    // Encoding.stepBack
    /**
     * Moves {@code s} back by up to {@code n} positions without going below {@code p}.
     *
     * @param p the lower bound
     * @param s the starting position
     * @param n the number of positions to step back
     * @return the new position, or {@code -1} if a step was attempted while
     *         {@code s <= p}
     */
    public static int stepBack(int p, int s, int n) {
        while (s != -1 && n-- > 0) {
            if (s <= p) {
                return -1;
            }
            s--;
        }
        return s;
    }

    /* onigenc_with_ascii_strncmp */
    /**
     * Compares up to {@code n} characters of two arrays.
     *
     * @param chars1 the first array
     * @param p1     start position in {@code chars1}
     * @param end    exclusive end of the valid range of {@code chars1}
     * @param chars2 the second array
     * @param p2     start position in {@code chars2}
     * @param n      the maximum number of characters to compare
     * @return {@code 0} if the first {@code n} characters are equal; the
     *         difference {@code chars2[i] - chars1[j]} at the first mismatch;
     *         or the current character of {@code chars2} if {@code chars1}
     *         reaches {@code end} first
     */
    public static int strNCmp(char[] chars1, int p1, int end, char[] chars2, int p2, int n) {
        while (n-- > 0) {
            if (p1 >= end) {
                return chars2[p2];
            }
            int x = chars2[p2] - chars1[p1];
            if (x != 0) {
                return x;
            }
            p2++;
            p1++;
        }
        return 0;
    }

    /**
     * Combines the bytes in {@code [p, end)} into a single integer, most
     * significant byte first.
     *
     * @param bytes the input
     * @param p     inclusive start position
     * @param end   exclusive end position
     * @return the combined value; {@code 0} for an empty range
     */
    public static int mbcToCode(byte[] bytes, int p, int end) {
        int code = 0;
        for (int i = p; i < end; i++) {
            code = (code << 8) | (bytes[i] & 0xff);
        }
        return code;
    }

    /**
     * @return the constant {@code 0x80}
     */
    public static int mbcodeStartPosition() {
        return 0x80;
    }

    /**
     * Returns the opposite-case counterpart of a character.
     *
     * @param flag unused by this implementation
     * @param c    the character to fold
     * @return a one-element array with the lower case form of an upper case
     *         character or the upper case form of a lower case character;
     *         otherwise the shared empty array
     */
    public static char[] caseFoldCodesByString(int flag, char c) {
        if (Character.isUpperCase(c)) {
            return new char[] {Character.toLowerCase(c)};
        }
        if (Character.isLowerCase(c)) {
            return new char[] {Character.toUpperCase(c)};
        }
        return EMPTYCHARS;
    }

    /**
     * Invokes {@code fun} for every lower case letter below {@code 0xffff},
     * once for the lower to upper mapping and once for the reverse mapping.
     *
     * <p>The same one-element array instance is passed to every invocation.
     *
     * @param flag unused by this implementation
     * @param fun  the callback receiving each mapping
     * @param arg  opaque argument forwarded to {@code fun}
     */
    public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
        int[] code = new int[1];

        for (int c = 0; c < 0xffff; c++) {
            if (Character.getType(c) == Character.LOWERCASE_LETTER) {
                int upper = code[0] = Character.toUpperCase(c);
                fun.apply(c, code, 1, arg);

                code[0] = c;
                fun.apply(upper, code, 1, arg);
            }
        }
    }

    // CodeRange.isInCodeRange
    /**
     * Tests whether {@code code} lies in one of the ranges stored in {@code p}.
     *
     * @param p    a code range array starting at index 0
     * @param code the code point to look up
     * @return {@code true} if some range contains {@code code}
     * @see #isInCodeRange(int[], int, int)
     */
    public static boolean isInCodeRange(int[]p, int code) {
        return isInCodeRange(p, 0, code);
    }

    /**
     * Returns the code range array for a character type, building and caching
     * it on first use by scanning code points {@code 0..0xffff} with
     * {@link #isCodeCType(int, int)}.
     *
     * @param ctype the character type, one of the {@link CharacterType} constants
     * @param sbOut always set to {@code 0x100}
     * @return the range array in {@code [numberOfRanges, rangeStart, rangeEnd, ...]}
     *         layout, or {@code null} if {@code ctype} is not below the cache size
     */
    public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
        sbOut.value = 0x100; // use bitset for codes smaller than 256
        int[] range = null;

        if (ctype < codeRanges.length) {
            range = codeRanges[ctype];

            if (range == null) {
                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
                range = new int[16];
                int rangeCount = 0;
                int lastCode = -2;

                for (int code = 0; code <= 0xffff; code++) {
                    if (isCodeCType(code, ctype)) {
                        if (lastCode < code - 1) {
                            // A gap was found: open a new range, growing the array
                            // so that both the start and end slots fit.
                            if (rangeCount * 2 + 2 >= range.length) {
                                range = Arrays.copyOf(range, range.length * 2);
                            }
                            range[rangeCount * 2 + 1] = code;
                            rangeCount++;
                        }
                        // Extend the end of the current range.
                        range[rangeCount * 2] = lastCode = code;
                    }
                }

                // Trim unused trailing slots.
                if (rangeCount * 2 + 1 < range.length) {
                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
                }

                range[0] = rangeCount;
                codeRanges[ctype] = range;
            }
        }

        return range;
    }

    // CodeRange.isInCodeRange
    /**
     * Tests whether {@code code} lies in one of the ranges stored in {@code p}
     * starting at {@code offset}.
     *
     * <p>The lookup is a binary search over the range end points, so the
     * ranges are expected to be in ascending order, as produced by
     * {@link #ctypeCodeRange(int, IntHolder)}.
     *
     * @param p      array holding {@code [numberOfRanges, rangeStart, rangeEnd, ...]}
     *               beginning at {@code offset}
     * @param offset index of the range count within {@code p}
     * @param code   the code point to look up
     * @return {@code true} if some range contains {@code code}
     */
    public static boolean isInCodeRange(int[]p, int offset, int code) {
        int low = 0;
        int n = p[offset];
        int high = n;

        // Find the first range whose end is not below code.
        while (low < high) {
            int x = (low + high) >> 1;
            if (code > p[(x << 1) + 2 + offset]) {
                low = x + 1;
            } else {
                high = x;
            }
        }
        return low < n && code >= p[(low << 1) + 1 + offset];
    }

    /**
     * Tests whether a code point belongs to a character type.
     *
     * @param code  the code point to test
     * @param ctype one of the {@link CharacterType} constants
     * @return {@code true} if {@code code} is a member of {@code ctype}
     * @throws RuntimeException if {@code ctype} is not a known character type
     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">Oniguruma RE.txt</a>
     */
    public static boolean isCodeCType(int code, int ctype) {
        int type;
        switch (ctype) {
            case CharacterType.NEWLINE:
                return code == EncodingHelper.NEW_LINE;
            case CharacterType.ALPHA:
                return hasCategoryIn(code, CharacterType.ALPHA_MASK);
            case CharacterType.BLANK:
                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
            case CharacterType.CNTRL:
                type = Character.getType(code);
                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
            case CharacterType.DIGIT:
                return EncodingHelper.isDigit(code);
            case CharacterType.GRAPH:
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return false;
                    default:
                        type = Character.getType(code);
                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
                }
            case CharacterType.LOWER:
                return Character.isLowerCase(code);
            case CharacterType.PRINT:
                type = Character.getType(code);
                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
            case CharacterType.PUNCT:
                return hasCategoryIn(code, CharacterType.PUNCT_MASK);
            case CharacterType.SPACE:
                // ECMA 7.2 and 7.3
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return true;
                    default:
                        // true if Unicode separator or BOM
                        return hasCategoryIn(code, CharacterType.SPACE_MASK) || code == 0xfeff;
                }
            case CharacterType.UPPER:
                return Character.isUpperCase(code);
            case CharacterType.XDIGIT:
                return EncodingHelper.isXDigit(code);
            case CharacterType.WORD:
                return hasCategoryIn(code, CharacterType.WORD_MASK);
            case CharacterType.ALNUM:
                return hasCategoryIn(code, CharacterType.ALNUM_MASK);
            case CharacterType.ASCII:
                return code < 0x80;
            default:
                throw new RuntimeException("illegal character type: " + ctype);
        }
    }

    /** Tests whether the bit for the code point's general category is set in {@code mask}. */
    private static boolean hasCategoryIn(int code, int mask) {
        return ((1 << Character.getType(code)) & mask) != 0;
    }
}