/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package jdk.nashorn.internal.runtime.regexp.joni;

import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;

import java.util.Arrays;

/**
 * Static helpers for classifying code points and for moving around in the input,
 * plus a lazily built table of code ranges per character type.
 *
 * <p>All members are static; the class holds no per-instance state.
 */
public final class EncodingHelper {

    static final int NEW_LINE            = 0x000a;
    static final int RETURN              = 0x000d;
    static final int LINE_SEPARATOR      = 0x2028;
    static final int PARAGRAPH_SEPARATOR = 0x2029;

    /** Shared empty result for {@link #caseFoldCodesByString(int, char)}. */
    static final char[] EMPTYCHARS = new char[0];

    /**
     * Cache of range tables built by {@link #ctypeCodeRange(int, IntHolder)}, indexed by
     * character type. Entries start out {@code null} and are filled on first request.
     * No synchronization is performed on this array.
     */
    static final int[][] codeRanges = new int[15][];

    /**
     * Returns the value of an ASCII decimal digit.
     *
     * @param code code point, expected to be in {@code '0'..'9'}
     * @return {@code code - '0'}; no range check is performed
     */
    public static int digitVal(int code) {
        return code - '0';
    }

    /**
     * Returns the value of an octal digit; identical to {@link #digitVal(int)}.
     *
     * @param code code point, expected to be an octal digit
     * @return {@code code - '0'}; no range check is performed
     */
    public static int odigitVal(int code) {
        return digitVal(code);
    }

    /**
     * Tells whether a code point is accepted as a hexadecimal digit.
     *
     * @param code code point to test
     * @return {@code true} for any Unicode decimal digit (per {@link Character#isDigit(int)})
     *         and for the ASCII letters {@code a-f} / {@code A-F}
     */
    public static boolean isXDigit(int code) {
        return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
    }

    /**
     * Returns the numeric value of a hexadecimal digit.
     *
     * <p>Every code point accepted by {@link #isXDigit(int)} is decoded to its value in
     * {@code 0..15}. For any other code point the result is not meaningful; callers are
     * expected to check {@link #isXDigit(int)} first.
     *
     * @param code code point of the hexadecimal digit
     * @return the digit value
     */
    public static int xdigitVal(int code) {
        if (isDigit(code)) {
            return code - '0';
        }
        if (code >= 'a' && code <= 'f') {
            return code - 'a' + 10;
        }
        if (Character.isDigit(code)) {
            // Non-ASCII decimal digit (isXDigit accepts these): subtracting '0' would
            // produce the distance from U+0030 rather than the digit's value.
            return Character.digit(code, 10);
        }
        return code - 'A' + 10;
    }

    /**
     * Tells whether a code point is an ASCII decimal digit.
     *
     * @param code code point to test
     * @return {@code true} iff {@code code} is in {@code '0'..'9'}
     */
    public static boolean isDigit(int code) {
        return code >= '0' && code <= '9';
    }

    /**
     * Tells whether a code point is a word character.
     *
     * @param code code point to test
     * @return {@code true} iff the bit for the code point's {@link Character#getType(int)}
     *         is set in {@code CharacterType.WORD_MASK}
     */
    public static boolean isWord(int code) {
        // letter, digit, or '_'
        return (typeBit(code) & CharacterType.WORD_MASK) != 0;
    }

    /**
     * Tells whether a code point is one of the four line terminators
     * U+000A, U+000D, U+2028 or U+2029.
     *
     * @param code code point to test
     * @return {@code true} iff {@code code} is a line terminator
     */
    public static boolean isNewLine(int code) {
        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
    }

    /**
     * Tells whether the character at {@code p} is a line terminator.
     *
     * @param chars input characters
     * @param p     index to inspect
     * @param end   exclusive upper bound for {@code p}
     * @return {@code false} when {@code p >= end}, otherwise {@code isNewLine(chars[p])}
     */
    public static boolean isNewLine(char[] chars, int p, int end) {
        return p < end && isNewLine(chars[p]);
    }

    /**
     * Returns the index of the character preceding {@code s} (Encoding.prevCharHead).
     *
     * @param p lower bound of the input
     * @param s current index
     * @return {@code s - 1}, or {@code -1} when {@code s <= p}
     */
    public static int prevCharHead(int p, int s) {
        return s <= p ? -1 : s - 1;
    }

    /**
     * Counterpart of {@code onigenc_get_right_adjust_char_head_with_prev}. No adjustment
     * is made here: {@code s} is returned unchanged and no previous position is computed.
     *
     * @param s    current index
     * @param prev if non-null, its {@code value} is set to {@code -1}
     * @return {@code s}
     */
    public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
        if (prev != null) {
            prev.value = -1; /* Sorry */
        }
        return s;
    }

    /**
     * Steps back {@code n} characters from {@code s} (Encoding.stepBack).
     *
     * @param p lower bound of the input
     * @param s starting index; {@code -1} is returned unchanged
     * @param n number of characters to step back
     * @return the resulting index, or {@code -1} if a step would have to start at or
     *         before {@code p}
     */
    public static int stepBack(int p, int s, int n) {
        while (s != -1 && n-- > 0) {
            if (s <= p) {
                return -1;
            }
            s--;
        }
        return s;
    }

    /**
     * Combines the bytes in {@code [p, end)} into one int, first byte most significant.
     *
     * @param bytes source bytes
     * @param p     inclusive start index
     * @param end   exclusive end index
     * @return the big-endian combination of the bytes; {@code 0} for an empty range.
     *         With more than four bytes the leading ones are shifted out.
     */
    public static int mbcToCode(byte[] bytes, int p, int end) {
        int code = 0;
        for (int i = p; i < end; i++) {
            code = (code << 8) | (bytes[i] & 0xff);
        }
        return code;
    }

    /**
     * @return the constant {@code 0x80}
     */
    public static int mbcodeStartPosition() {
        return 0x80;
    }

    /**
     * Returns the opposite-case variant of a character.
     *
     * @param flag not used by this implementation
     * @param c    character to fold
     * @return a new one-element array holding the lower-case form of an upper-case
     *         {@code c} or the upper-case form of a lower-case {@code c}; the shared
     *         empty array when {@code c} is neither
     */
    public static char[] caseFoldCodesByString(int flag, char c) {
        if (Character.isUpperCase(c)) {
            return new char[] {Character.toLowerCase(c)};
        } else if (Character.isLowerCase(c)) {
            return new char[] {Character.toUpperCase(c)};
        } else {
            return EMPTYCHARS;
        }
    }

    /**
     * Reports lower/upper case pairs to {@code fun}.
     *
     * <p>For every code point {@code c} in {@code [0, 0xffff)} whose general category is
     * {@link Character#LOWERCASE_LETTER}, {@code fun} is called twice: once with
     * {@code c} and its upper-case form, once the other way round. The same one-element
     * array instance is reused for every call, so it only holds the reported value for
     * the duration of that call.
     *
     * @param flag not used by this implementation
     * @param fun  callback receiving each mapping
     * @param arg  opaque argument passed through to {@code fun}
     */
    public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
        int[] code = new int[1];

        for (int c = 0; c < 0xffff; c++) {
            if (Character.getType(c) == Character.LOWERCASE_LETTER) {
                int upper = Character.toUpperCase(c);

                code[0] = upper;
                fun.apply(c, code, 1, arg);

                code[0] = c;
                fun.apply(upper, code, 1, arg);
            }
        }
    }

    /**
     * Returns the table of code ranges that belong to a character type, building and
     * caching it on first use.
     *
     * <p>Table layout: {@code [numberOfRanges, start0, end0, start1, end1, ...]} with
     * inclusive bounds, in ascending order, covering code points {@code 0..0xffff}.
     * The returned array is the cached instance itself, not a copy.
     *
     * @param ctype character type constant, used as index into the cache
     * @param sbOut always receives {@code 0x100} in its {@code value} field
     * @return the range table, or {@code null} when {@code ctype} is not below the
     *         cache size
     */
    public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
        sbOut.value = 0x100; // use bitset for codes smaller than 256
        int[] range = null;

        if (ctype < codeRanges.length) {
            range = codeRanges[ctype];

            if (range == null) {
                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
                range = new int[16];
                int rangeCount = 0;
                int lastCode = -2;

                for (int code = 0; code <= 0xffff; code++) {
                    if (isCodeCType(code, ctype)) {
                        if (lastCode < code - 1) {
                            // Gap since the previous member: open a new range, growing
                            // the table first if the start/end pair would not fit.
                            if (rangeCount * 2 + 2 >= range.length) {
                                range = Arrays.copyOf(range, range.length * 2);
                            }
                            range[rangeCount * 2 + 1] = code;
                            rangeCount++;
                        }
                        // Extend (or initialize) the end of the current range.
                        range[rangeCount * 2] = lastCode = code;
                    }
                }

                // Trim unused capacity.
                if (rangeCount * 2 + 1 < range.length) {
                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
                }

                range[0] = rangeCount;
                codeRanges[ctype] = range;
            }
        }

        return range;
    }

    /**
     * Tests whether a code point lies in one of the ranges of a range table
     * (CodeRange.isInCodeRange).
     *
     * @param p      array holding a table in the layout
     *               {@code [numberOfRanges, start0, end0, ...]}
     * @param offset index in {@code p} at which the table starts
     * @param code   code point to look up
     * @return {@code true} iff some range contains {@code code}
     */
    public static boolean isInCodeRange(int[] p, int offset, int code) {
        int n = p[offset];
        int low = 0;
        int high = n;

        // Binary search for the first range whose end is >= code.
        while (low < high) {
            int x = (low + high) >>> 1;
            if (code > p[(x << 1) + 2 + offset]) {
                low = x + 1;
            } else {
                high = x;
            }
        }
        return low < n && code >= p[(low << 1) + 1 + offset];
    }

    /**
     * Tests whether a code point belongs to a character type.
     *
     * <p>Most types are decided by testing the bit for the code point's
     * {@link Character#getType(int)} against the corresponding {@code CharacterType}
     * mask. For GRAPH and PRINT the test is inverted: the code point qualifies when its
     * bit is <em>not</em> in the mask and it is not unassigned.
     *
     * @param code  code point to test
     * @param ctype one of the {@code CharacterType} constants
     * @return {@code true} iff {@code code} belongs to {@code ctype}
     * @throws IllegalArgumentException if {@code ctype} is not a known character type
     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
     */
    public static boolean isCodeCType(int code, int ctype) {
        int type;
        switch (ctype) {
            case CharacterType.NEWLINE:
                return isNewLine(code);
            case CharacterType.ALPHA:
                return (typeBit(code) & CharacterType.ALPHA_MASK) != 0;
            case CharacterType.BLANK:
                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
            case CharacterType.CNTRL:
                type = Character.getType(code);
                return ((1 << type) & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
            case CharacterType.DIGIT:
                return EncodingHelper.isDigit(code);
            case CharacterType.GRAPH:
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return false;
                    default:
                        type = Character.getType(code);
                        return ((1 << type) & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
                }
            case CharacterType.LOWER:
                return Character.isLowerCase(code);
            case CharacterType.PRINT:
                type = Character.getType(code);
                return ((1 << type) & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
            case CharacterType.PUNCT:
                return (typeBit(code) & CharacterType.PUNCT_MASK) != 0;
            case CharacterType.SPACE:
                // ECMA 7.2 and 7.3
                switch (code) {
                    case 0x09:
                    case 0x0a:
                    case 0x0b:
                    case 0x0c:
                    case 0x0d:
                        return true;
                    default:
                        // true if Unicode separator or BOM
                        return (typeBit(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
                }
            case CharacterType.UPPER:
                return Character.isUpperCase(code);
            case CharacterType.XDIGIT:
                return EncodingHelper.isXDigit(code);
            case CharacterType.WORD:
                return (typeBit(code) & CharacterType.WORD_MASK) != 0;
            case CharacterType.ALNUM:
                return (typeBit(code) & CharacterType.ALNUM_MASK) != 0;
            case CharacterType.ASCII:
                return code < 0x80;
            default:
                throw new IllegalArgumentException("illegal character type: " + ctype);
        }
    }

    /** Returns the single-bit mask {@code 1 << Character.getType(code)}. */
    private static int typeBit(int code) {
        return 1 << Character.getType(code);
    }
}