1 /* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 19 */ 20 package jdk.nashorn.internal.runtime.regexp.joni; 21 22 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType; 23 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder; 24 25 import java.util.Arrays; 26 27 public final class EncodingHelper { 28 29 final static int NEW_LINE = 0x000a; 30 final static int RETURN = 0x000d; 31 final static int LINE_SEPARATOR = 0x2028; 32 final static int PARAGRAPH_SEPARATOR = 0x2029; 33 34 final static char[] EMPTYCHARS = new char[0]; 35 final static int[][] codeRanges = new int[15][]; 36 37 public static int digitVal(int code) { 38 return code - '0'; 39 } 40 41 public static int odigitVal(int code) { 42 return digitVal(code); 43 } 44 45 public static boolean isXDigit(int code) { 46 return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F'); 47 } 48 49 public static int xdigitVal(int code) { 50 if (Character.isDigit(code)) { 51 return code - '0'; 52 } else if (code >= 'a' && code <= 'f') { 53 return code - 'a' + 10; 54 } else { 55 return code - 'A' + 10; 56 } 57 } 58 59 public static boolean isDigit(int code) { 60 return code >= '0' && code <= '9'; 61 } 62 63 public static boolean isWord(int code) { 64 // letter, digit, or '_' 65 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 66 } 67 68 public static boolean isNewLine(int code) { 69 return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR; 70 } 71 72 public static boolean isNewLine(char[] chars, int p, int end) { 73 return p < end && isNewLine(chars[p]); 74 } 75 76 // Encoding.prevCharHead 77 public static int prevCharHead(int p, int s) { 78 return s <= p ? -1 : s - 1; 79 } 80 81 /* onigenc_get_right_adjust_char_head_with_prev */ 82 public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) { 83 if (prev != null) prev.value = -1; /* Sorry */ 84 return s; 85 } 86 87 // Encoding.stepBack 88 public static int stepBack(int p, int s, int n) { 89 while (s != -1 && n-- > 0) { 90 if (s <= p) return -1; 91 s--; 92 } 93 return s; 94 } 95 96 public static int mbcodeStartPosition() { 97 return 0x80; 98 } 99 100 public static char[] caseFoldCodesByString(int flag, char c) { 101 char[] codes = EMPTYCHARS; 102 final char upper = toUpperCase(c); 103 104 if (upper != toLowerCase(upper)) { 105 int count = 0; 106 char ch = 0; 107 108 do { 109 final char u = toUpperCase(ch); 110 if (u == upper && ch != c) { 111 // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine. 112 codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1); 113 codes[count++] = ch; 114 } 115 } while (ch++ < 0xffff); 116 } 117 return codes; 118 } 119 120 public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) { 121 for (int c = 0; c < 0xffff; c++) { 122 if (Character.isLowerCase(c)) { 123 final int upper = toUpperCase(c); 124 125 if (upper != c) { 126 fun.apply(c, upper, arg); 127 } 128 } 129 } 130 131 // Some characters have multiple lower case variants, hence we need to do a second run 132 for (int c = 0; c < 0xffff; c++) { 133 if (Character.isLowerCase(c)) { 134 final int upper = toUpperCase(c); 135 136 if (upper != c) { 137 fun.apply(upper, c, arg); 138 } 139 } 140 } 141 } 142 143 public static char toLowerCase(char c) { 144 return (char)toLowerCase((int)c); 145 } 146 147 public static int toLowerCase(int c) { 148 if (c < 128) { 149 return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c; 150 } 151 // Do not convert non-ASCII upper case character to ASCII lower case. 152 int lower = Character.toLowerCase(c); 153 return (lower < 128) ? c : lower; 154 155 } 156 157 public static char toUpperCase(char c) { 158 return (char)toUpperCase((int)c); 159 } 160 161 public static int toUpperCase(int c) { 162 if (c < 128) { 163 return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c; 164 } 165 // Do not convert non-ASCII lower case character to ASCII upper case. 166 int upper = Character.toUpperCase(c); 167 return (upper < 128) ? c : upper; 168 } 169 170 public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) { 171 sbOut.value = 0x100; // use bitset for codes smaller than 256 172 int[] range = null; 173 174 if (ctype < codeRanges.length) { 175 range = codeRanges[ctype]; 176 177 if (range == null) { 178 // format: [numberOfRanges, rangeStart, rangeEnd, ...] 179 range = new int[16]; 180 int rangeCount = 0; 181 int lastCode = -2; 182 183 for (int code = 0; code <= 0xffff; code++) { 184 if (isCodeCType(code, ctype)) { 185 if (lastCode < code -1) { 186 if (rangeCount * 2 + 2 >= range.length) { 187 range = Arrays.copyOf(range, range.length * 2); 188 } 189 range[rangeCount * 2 + 1] = code; 190 rangeCount++; 191 } 192 range[rangeCount * 2] = lastCode = code; 193 } 194 } 195 196 if (rangeCount * 2 + 1 < range.length) { 197 range = Arrays.copyOf(range, rangeCount * 2 + 1); 198 } 199 200 range[0] = rangeCount; 201 codeRanges[ctype] = range; 202 } 203 } 204 205 return range; 206 } 207 208 // CodeRange.isInCodeRange 209 public static boolean isInCodeRange(int[] p, int offset, int code) { 210 int low = 0; 211 int n = p[offset]; 212 int high = n ; 213 214 while (low < high) { 215 int x = (low + high) >> 1; 216 if (code > p[(x << 1) + 2 + offset]) { 217 low = x + 1; 218 } else { 219 high = x; 220 } 221 } 222 return low < n && code >= p[(low << 1) + 1 + offset]; 223 } 224 225 /** 226 * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a> 227 */ 228 public static boolean isCodeCType(int code, int ctype) { 229 int type; 230 switch (ctype) { 231 case CharacterType.NEWLINE: 232 return isNewLine(code); 233 case CharacterType.ALPHA: 234 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0; 235 case CharacterType.BLANK: 236 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR; 237 case CharacterType.CNTRL: 238 type = Character.getType(code); 239 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED; 240 case CharacterType.DIGIT: 241 return EncodingHelper.isDigit(code); 242 case CharacterType.GRAPH: 243 switch (code) { 244 case 0x09: 245 case 0x0a: 246 case 0x0b: 247 case 0x0c: 248 case 0x0d: 249 return false; 250 default: 251 type = Character.getType(code); 252 return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED; 253 } 254 case CharacterType.LOWER: 255 return Character.isLowerCase(code); 256 case CharacterType.PRINT: 257 type = Character.getType(code); 258 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED; 259 case CharacterType.PUNCT: 260 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0; 261 case CharacterType.SPACE: 262 // ECMA 7.2 and 7.3 263 switch (code) { 264 case 0x09: 265 case 0x0a: 266 case 0x0b: 267 case 0x0c: 268 case 0x0d: 269 return true; 270 default: 271 // true if Unicode separator or BOM 272 return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff; 273 } 274 case CharacterType.UPPER: 275 return Character.isUpperCase(code); 276 case CharacterType.XDIGIT: 277 return EncodingHelper.isXDigit(code); 278 case CharacterType.WORD: 279 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 280 case CharacterType.ALNUM: 281 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0; 282 case CharacterType.ASCII: 283 return code < 0x80; 284 default: 285 throw new RuntimeException("illegal character type: " + ctype); 286 } 287 } 288 } 289 --- EOF ---