1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
273 // if the character is opening punctuation, verify that no nesting
274 // rules are broken, and push the character onto the stack
275 case '{':
276 case '<':
277 case '[':
278 case '(':
279 if (lastOpen == '<') {
280 error("Can't nest brackets inside <>", p, description);
281 }
282 if (lastOpen == '[' && c != '[') {
283 error("Can't nest anything in [] but []", p, description);
284 }
285
286 // if we see < anywhere except on the left-hand side of =,
287 // we must be seeing a variable name that was never defined
288 if (c == '<' && (haveEquals || havePipe)) {
289 error("Unknown variable name", p, description);
290 }
291
292 lastOpen = c;
293 parenStack.push(new Character((char)c));
294 if (c == '<') {
295 sawVarName = true;
296 }
297 break;
298
299 // if the character is closing punctuation, verify that it matches the
300 // last opening punctuation we saw, and that the brackets contain
301 // something, then pop the stack
302 case '}':
303 case '>':
304 case ']':
305 case ')':
306 char expectedClose = '\u0000';
307 switch (lastOpen) {
308 case '{':
309 expectedClose = '}';
310 break;
311 case '[':
312 expectedClose = ']';
313 break;
885 // that is, if the string ends in "aaaabc", the break will go before the first
886 // "a" rather than the last one. Both of these are limitations in the design
887 // of RuleBasedBreakIterator and not limitations of the rule parser.
888
889 int p = 0;
890 int currentState = 1; // don't use state number 0; 0 means "stop"
891 int lastState = currentState;
892 String pendingChars = "";
893
894 decisionPointStack = new Stack<>();
895 decisionPointList = new Vector<>();
896 loopingStates = new Vector<>();
897 statesToBackfill = new Vector<>();
898
899 short[] state;
900 boolean sawEarlyBreak = false;
901
902 // if we're adding rules to the backward state table, mark the initial state
903 // as a looping state
904 if (!forward) {
905 loopingStates.addElement(new Integer(1));
906 }
907
908 // put the current state on the decision point list before we start
909 decisionPointList.addElement(new Integer(currentState)); // we want currentState to
910 // be 1 here...
911 currentState = tempStateTable.size() - 1; // but after that, we want it to be
912 // 1 less than the state number of the next state
913 while (p < rule.length()) {
914 int c = rule.codePointAt(p);
915 clearLoopingStates = false;
916
917 // this section handles literal characters, escaped characters (which are
918 // effectively literal characters too), the . token, and [] expressions
919 if (c == '['
920 || c == '\\'
921 || Character.isLetter(c)
922 || Character.isDigit(c)
923 || c < ' '
924 || c == '.'
925 || c >= '\u007f') {
926
927 // if we're not on a period, isolate the expression and look up
928 // the corresponding category list
929 if (c != '.') {
961 else {
962 q = p + Character.charCount(c);
963 }
964
965 // look up the category list for the expression and store it
966 // in pendingChars
967 pendingChars = (String)expressions.get(rule.substring(p, q));
968
969 // advance the current position past the expression
970 p = q - Character.charCount(rule.codePointBefore(q));
971 }
972
973 // if the character we're on is a period, we end up down here
974 else {
975 int rowNum = decisionPointList.lastElement().intValue();
976 state = tempStateTable.elementAt(rowNum);
977
978 // if the period is followed by an asterisk, then just set the current
979 // state to loop back on itself
980 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
981 decisionPointList.addElement(new Integer(state[0]));
982 pendingChars = "";
983 ++p;
984 }
985
986 // otherwise, fabricate a category list ("pendingChars") with
987 // every category in it
988 else {
989 StringBuffer temp = new StringBuffer();
990 for (int i = 0; i < numCategories; i++)
991 temp.append((char)(i + 0x100));
992 pendingChars = temp.toString();
993 }
994 }
995
996 // we'll end up in here for all expressions except for .*, which is
997 // special-cased above
998 if (pendingChars.length() != 0) {
999
1000 // if the expression is followed by an asterisk, then push a copy
1001 // of the current desicion point list onto the stack (this is
1002 // the same thing we do on an opening brace)
1003 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') {
1004 @SuppressWarnings("unchecked")
1005 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1006 decisionPointStack.push(clone);
1007 }
1008
1009 // create a new state, add it to the list of states to backfill
1010 // if we have looping states to worry about, set its "don't make
1011 // me an accepting state" flag if we've seen a slash, and add
1012 // it to the end of the state table
1013 int newState = tempStateTable.size();
1014 if (loopingStates.size() != 0) {
1015 statesToBackfill.addElement(new Integer(newState));
1016 }
1017 state = new short[numCategories + 1];
1018 if (sawEarlyBreak) {
1019 state[numCategories] = DONT_LOOP_FLAG;
1020 }
1021 tempStateTable.addElement(state);
1022
1023 // update everybody in the decision point list to point to
1024 // the new state (this also performs all the reconciliation
1025 // needed to make the table deterministic), then clear the
1026 // decision point list
1027 updateStateTable(decisionPointList, pendingChars, (short)newState);
1028 decisionPointList.removeAllElements();
1029
1030 // add all states created since the last literal character we've
1031 // seen to the decision point list
1032 lastState = currentState;
1033 do {
1034 ++currentState;
1035 decisionPointList.addElement(new Integer(currentState));
1036 } while (currentState + 1 < tempStateTable.size());
1037 }
1038 }
1039
1040 // a { marks the beginning of an optional run of characters. Push a
1041 // copy of the current decision point list onto the stack. This saves
1042 // it, preventing it from being affected by whatever's inside the parentheses.
1043 // This decision point list is restored when a } is encountered.
1044 else if (c == '{') {
1045 @SuppressWarnings("unchecked")
1046 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1047 decisionPointStack.push(clone);
1048 }
1049
1050 // a } marks the end of an optional run of characters. Pop the last decision
1051 // point list off the stack and merge it with the current decision point list.
1052 // a * denotes a repeating character or group (* after () is handled separately
1053 // below). In addition to restoring the decision point list, modify the
1054 // current state to point to itself on the appropriate character categories.
1055 else if (c == '}' || c == '*') {
1056 // when there's a *, update the current state to loop back on itself
1057 // on the character categories that caused us to enter this state
1058 if (c == '*') {
1059 for (int i = lastState + 1; i < tempStateTable.size(); i++) {
1060 Vector<Integer> temp = new Vector<>();
1061 temp.addElement(new Integer(i));
1062 updateStateTable(temp, pendingChars, (short)(lastState + 1));
1063 }
1064 }
1065
1066 // pop the top element off the decision point stack and merge
1067 // it with the current decision point list (this causes the divergent
1068 // paths through the state table to come together again on the next
1069 // new state)
1070 Vector<Integer> temp = decisionPointStack.pop();
1071 for (int i = 0; i < decisionPointList.size(); i++)
1072 temp.addElement(decisionPointList.elementAt(i));
1073 decisionPointList = temp;
1074 }
1075
1076 // a ? after a * modifies the behavior of * in cases where there is overlap
1077 // between the set of characters that repeat and the characters which follow.
1078 // Without the ?, all states following the repeating state, up to a state which
1079 // is reached by a character that doesn't overlap, will loop back into the
1080 // repeating state. With the ?, the mark states following the *? DON'T loop
1081 // back into the repeating state. Thus, "[a-z]*xyz" will match the longest
1104 // points before the ( (i.e., the places from which the () can be entered),
1105 // we need to keep track of the entry points in case the expression loops
1106 // (i.e., is followed by *). We do that by creating a dummy state in the
1107 // state table and adding it to the decision point list (BEFORE it's duplicated
1108 // on the stack). Nobody points to this state, so it'll get optimized out
1109 // at the end. It exists only to hold the entry points in case the ()
1110 // expression loops.
1111 else if (c == '(') {
1112
1113 // add a new state to the state table to hold the entry points into
1114 // the () expression
1115 tempStateTable.addElement(new short[numCategories + 1]);
1116
1117 // we have to adjust lastState and currentState to account for the
1118 // new dummy state
1119 lastState = currentState;
1120 ++currentState;
1121
1122 // add the current state to the decision point list (add it at the
1123 // BEGINNING so we can find it later)
1124 decisionPointList.insertElementAt(new Integer(currentState), 0);
1125
1126 // finally, push a copy of the current decision point list onto the
1127 // stack (this keeps track of the active decision point list before
1128 // the () expression), followed by an empty decision point list
1129 // (this will hold the exit points)
1130 @SuppressWarnings("unchecked")
1131 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1132 decisionPointStack.push(clone);
1133 decisionPointStack.push(new Vector<Integer>());
1134 }
1135
1136 // a | separates alternative character sequences in a () expression. When
1137 // a | is encountered, we add the current decision point list to the exit-point
1138 // list, and restore the decision point list to its state prior to the (.
1139 else if (c == '|') {
1140
1141 // pick out the top two decision point lists on the stack
1142 Vector<Integer> oneDown = decisionPointStack.pop();
1143 Vector<Integer> twoDown = decisionPointStack.peek();
1144 decisionPointStack.push(oneDown);
1191 // pop the original decision point list off the stack
1192 Vector<Integer> temp = decisionPointStack.pop();
1193
1194 // we squirreled away the row number of our entry point list
1195 // at the beginning of the original decision point list. Fish
1196 // that state number out and retrieve the entry point list
1197 int tempStateNum = temp.firstElement().intValue();
1198 short[] tempState = tempStateTable.elementAt(tempStateNum);
1199
1200 // merge the original decision point list with the current
1201 // decision point list
1202 for (int i = 0; i < decisionPointList.size(); i++)
1203 temp.addElement(decisionPointList.elementAt(i));
1204 decisionPointList = temp;
1205
1206 // finally, copy every forward reference from the entry point
1207 // list into every state in the new decision point list
1208 for (int i = 0; i < tempState.length; i++) {
1209 if (tempState[i] > tempStateNum) {
1210 updateStateTable(exitPoints,
1211 new Character((char)(i + 0x100)).toString(),
1212 tempState[i]);
1213 }
1214 }
1215
1216 // update lastState and currentState, and throw away the *
1217 lastState = currentState;
1218 currentState = tempStateTable.size() - 1;
1219 ++p;
1220 }
1221 }
1222
1223 // a / marks the position where the break is to go if the character sequence
1224 // matches this rule. We update the flag word of every state on the decision
1225 // point list to mark them as ending states, and take note of the fact that
1226 // we've seen the slash
1227 else if (c == '/') {
1228 sawEarlyBreak = true;
1229 for (int i = 0; i < decisionPointList.size(); i++) {
1230 state = tempStateTable.elementAt(decisionPointList.
1231 elementAt(i).intValue());
1313 * into the state in the state table (we'll call that oldValues). If there's a
1314 * collision (i.e., if the same cell has a nonzero value in both states, and it's
1315 * not the SAME value), then we have to reconcile the collision. We do this by
1316 * creating a new state, adding it to the end of the state table, and using this
1317 * function recursively to merge the original two states into a single, combined
1318 * state. This process may happen recursively (i.e., each successive level may
1319 * involve collisions). To prevent infinite recursion, we keep a log of merge
1320 * operations. Any time we're merging two states we've merged before, we can just
1321 * supply the row number for the result of that merge operation rather than creating
1322 * a new state just like it.
1323 * @param rowNum The row number in the state table of the state to be updated
1324 * @param newValues The state to merge it with.
1325 * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
1326 * (itself a copy of the decision point list from parseRule()). Newly-created
1327 * states get added to the decision point list if their "parents" were on it.
1328 */
1329 private void mergeStates(int rowNum,
1330 short[] newValues,
1331 Vector<Integer> rowsBeingUpdated) {
1332 short[] oldValues = tempStateTable.elementAt(rowNum);
1333 boolean isLoopingState = loopingStates.contains(new Integer(rowNum));
1334
1335 // for each of the cells in the rows we're reconciling, do...
1336 for (int i = 0; i < oldValues.length; i++) {
1337
1338 // if they contain the same value, we don't have to do anything
1339 if (oldValues[i] == newValues[i]) {
1340 continue;
1341 }
1342
1343 // if oldValues is a looping state and the state the current cell points to
1344 // is too, then we can just stomp over the current value of that cell (and
1345 // set the clear-looping-states flag if necessary)
1346 else if (isLoopingState && loopingStates.contains(new Integer(oldValues[i]))) {
1347 if (newValues[i] != 0) {
1348 if (oldValues[i] == 0) {
1349 clearLoopingStates = true;
1350 }
1351 oldValues[i] = newValues[i];
1352 }
1353 }
1354
1355 // if the current cell in oldValues is 0, copy in the corresponding value
1356 // from newValues
1357 else if (oldValues[i] == 0) {
1358 oldValues[i] = newValues[i];
1359 }
1360
1361 // the last column of each row is the flag column. Take care to merge the
1362 // flag words correctly
1363 else if (i == numCategories) {
1364 oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
1365 }
1366
1384
1385 // add this pair of row numbers to the merge list (create it first
1386 // if we haven't created the merge list yet)
1387 if (mergeList == null) {
1388 mergeList = new Vector<>();
1389 }
1390 mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
1391
1392 // create a new row to represent the merged state, and copy the
1393 // contents of oldRow into it, then add it to the end of the
1394 // state table and update the original row (oldValues) to point
1395 // to the new, merged, state
1396 short[] newRow = new short[numCategories + 1];
1397 short[] oldRow = tempStateTable.elementAt(oldRowNum);
1398 System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
1399 tempStateTable.addElement(newRow);
1400 oldValues[i] = (short)combinedRowNum;
1401
1402 // if the decision point list contains either of the parent rows,
1403 // update it to include the new row as well
1404 if ((decisionPointList.contains(new Integer(oldRowNum))
1405 || decisionPointList.contains(new Integer(newRowNum)))
1406 && !decisionPointList.contains(new Integer(combinedRowNum))
1407 ) {
1408 decisionPointList.addElement(new Integer(combinedRowNum));
1409 }
1410
1411 // do the same thing with the list of rows being updated
1412 if ((rowsBeingUpdated.contains(new Integer(oldRowNum))
1413 || rowsBeingUpdated.contains(new Integer(newRowNum)))
1414 && !rowsBeingUpdated.contains(new Integer(combinedRowNum))
1415 ) {
1416 decisionPointList.addElement(new Integer(combinedRowNum));
1417 }
1418 // now (groan) do the same thing for all the entries on the
1419 // decision point stack
1420 for (int k = 0; k < decisionPointStack.size(); k++) {
1421 Vector<Integer> dpl = decisionPointStack.elementAt(k);
1422 if ((dpl.contains(new Integer(oldRowNum))
1423 || dpl.contains(new Integer(newRowNum)))
1424 && !dpl.contains(new Integer(combinedRowNum))
1425 ) {
1426 dpl.addElement(new Integer(combinedRowNum));
1427 }
1428 }
1429
1430 // FINALLY (puff puff puff), call mergeStates() recursively to copy
1431 // the row referred to by newValues into the new row and resolve any
1432 // conflicts that come up at that level
1433 mergeStates(combinedRowNum, tempStateTable.elementAt(
1434 newValues[i]), rowsBeingUpdated);
1435 }
1436 }
1437 }
1438 return;
1439 }
1440
1441 /**
1442 * The merge list is a list of pairs of rows that have been merged somewhere in
1443 * the process of building this state table, along with the row number of the
1444 * row containing the merged state. This function looks up a pair of row numbers
1445 * and returns the row number of the row they combine into. (It returns 0 if
1446 * this pair of rows isn't in the merge list.)
1519 }
1520 statesToBackfill.removeAllElements();
1521 loopingStates.removeAllElements();
1522 }
1523
1524 if (newLoopingStates != null) {
1525 @SuppressWarnings("unchecked")
1526 Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone();
1527 loopingStates = clone;
1528 }
1529 }
1530
1531 /**
1532 * This removes "ending states" and states reachable from them from the
1533 * list of states to backfill.
1534 * @param The row number of the state to remove from the backfill list
1535 */
1536 private void eliminateBackfillStates(int baseState) {
1537
1538 // don't do anything unless this state is actually in the backfill list...
1539 if (statesToBackfill.contains(new Integer(baseState))) {
1540
1541 // if it is, take it out
1542 statesToBackfill.removeElement(new Integer(baseState));
1543
1544 // then go through and recursively call this function for every
1545 // state that the base state points to
1546 short[] state = tempStateTable.elementAt(baseState);
1547 for (int i = 0; i < numCategories; i++) {
1548 if (state[i] != 0) {
1549 eliminateBackfillStates(state[i]);
1550 }
1551 }
1552 }
1553 }
1554
1555 /**
1556 * This function completes the backfilling process by actually doing the
1557 * backfilling on the states that are marked for it
1558 */
1559 private void backfillLoopingStates() {
1560 short[] state;
1561 short[] loopingState = null;
1562 int loopingStateRowNum = 0;
1591 else if (state[j] == DONT_LOOP_FLAG) {
1592 state[j] = 0;
1593 }
1594 }
1595 }
1596 }
1597 }
1598
1599 /**
1600 * This function completes the state-table-building process by doing several
1601 * postprocessing steps and copying everything into its final resting place
1602 * in the iterator itself
1603 * @param forward True if we're working on the forward state table
1604 */
1605 private void finishBuildingStateTable(boolean forward) {
1606 // start by backfilling the looping states
1607 backfillLoopingStates();
1608
1609 int[] rowNumMap = new int[tempStateTable.size()];
1610 Stack<Integer> rowsToFollow = new Stack<>();
1611 rowsToFollow.push(new Integer(1));
1612 rowNumMap[1] = 1;
1613
1614 // determine which states are no longer reachable from the start state
1615 // (the reachable states will have their row numbers in the row number
1616 // map, and the nonreachable states will have zero in the row number map)
1617 while (rowsToFollow.size() != 0) {
1618 int rowNum = rowsToFollow.pop().intValue();
1619 short[] row = tempStateTable.elementAt(rowNum);
1620
1621 for (int i = 0; i < numCategories; i++) {
1622 if (row[i] != 0) {
1623 if (rowNumMap[row[i]] == 0) {
1624 rowNumMap[row[i]] = row[i];
1625 rowsToFollow.push(new Integer(row[i]));
1626 }
1627 }
1628 }
1629 }
1630
1631 boolean madeChange;
1632 int newRowNum;
1633
1634 // algorithm for minimizing the number of states in the table adapted from
1635 // Aho & Ullman, "Principles of Compiler Design"
1636 // The basic idea here is to organize the states into classes. When we're done,
1637 // all states in the same class can be considered identical and all but one eliminated.
1638
1639 // initially assign states to classes based on the number of populated cells they
1640 // contain (the class number is the number of populated cells)
1641 int[] stateClasses = new int[tempStateTable.size()];
1642 int nextClass = numCategories + 1;
1643 short[] state1, state2;
1644 for (int i = 1; i < stateClasses.length; i++) {
1645 if (rowNumMap[i] == 0) {
|
1 /*
2 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
273 // if the character is opening punctuation, verify that no nesting
274 // rules are broken, and push the character onto the stack
275 case '{':
276 case '<':
277 case '[':
278 case '(':
279 if (lastOpen == '<') {
280 error("Can't nest brackets inside <>", p, description);
281 }
282 if (lastOpen == '[' && c != '[') {
283 error("Can't nest anything in [] but []", p, description);
284 }
285
286 // if we see < anywhere except on the left-hand side of =,
287 // we must be seeing a variable name that was never defined
288 if (c == '<' && (haveEquals || havePipe)) {
289 error("Unknown variable name", p, description);
290 }
291
292 lastOpen = c;
293 parenStack.push(Character.valueOf((char)c));
294 if (c == '<') {
295 sawVarName = true;
296 }
297 break;
298
299 // if the character is closing punctuation, verify that it matches the
300 // last opening punctuation we saw, and that the brackets contain
301 // something, then pop the stack
302 case '}':
303 case '>':
304 case ']':
305 case ')':
306 char expectedClose = '\u0000';
307 switch (lastOpen) {
308 case '{':
309 expectedClose = '}';
310 break;
311 case '[':
312 expectedClose = ']';
313 break;
885 // that is, if the string ends in "aaaabc", the break will go before the first
886 // "a" rather than the last one. Both of these are limitations in the design
887 // of RuleBasedBreakIterator and not limitations of the rule parser.
888
889 int p = 0;
890 int currentState = 1; // don't use state number 0; 0 means "stop"
891 int lastState = currentState;
892 String pendingChars = "";
893
894 decisionPointStack = new Stack<>();
895 decisionPointList = new Vector<>();
896 loopingStates = new Vector<>();
897 statesToBackfill = new Vector<>();
898
899 short[] state;
900 boolean sawEarlyBreak = false;
901
902 // if we're adding rules to the backward state table, mark the initial state
903 // as a looping state
904 if (!forward) {
905 loopingStates.addElement(Integer.valueOf(1));
906 }
907
908 // put the current state on the decision point list before we start
909 decisionPointList.addElement(Integer.valueOf(currentState)); // we want currentState to
910 // be 1 here...
911 currentState = tempStateTable.size() - 1; // but after that, we want it to be
912 // 1 less than the state number of the next state
913 while (p < rule.length()) {
914 int c = rule.codePointAt(p);
915 clearLoopingStates = false;
916
917 // this section handles literal characters, escaped characters (which are
918 // effectively literal characters too), the . token, and [] expressions
919 if (c == '['
920 || c == '\\'
921 || Character.isLetter(c)
922 || Character.isDigit(c)
923 || c < ' '
924 || c == '.'
925 || c >= '\u007f') {
926
927 // if we're not on a period, isolate the expression and look up
928 // the corresponding category list
929 if (c != '.') {
961 else {
962 q = p + Character.charCount(c);
963 }
964
965 // look up the category list for the expression and store it
966 // in pendingChars
967 pendingChars = (String)expressions.get(rule.substring(p, q));
968
969 // advance the current position past the expression
970 p = q - Character.charCount(rule.codePointBefore(q));
971 }
972
973 // if the character we're on is a period, we end up down here
974 else {
975 int rowNum = decisionPointList.lastElement().intValue();
976 state = tempStateTable.elementAt(rowNum);
977
978 // if the period is followed by an asterisk, then just set the current
979 // state to loop back on itself
980 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
981 decisionPointList.addElement(Integer.valueOf(state[0]));
982 pendingChars = "";
983 ++p;
984 }
985
986 // otherwise, fabricate a category list ("pendingChars") with
987 // every category in it
988 else {
989 StringBuffer temp = new StringBuffer();
990 for (int i = 0; i < numCategories; i++)
991 temp.append((char)(i + 0x100));
992 pendingChars = temp.toString();
993 }
994 }
995
996 // we'll end up in here for all expressions except for .*, which is
997 // special-cased above
998 if (pendingChars.length() != 0) {
999
1000 // if the expression is followed by an asterisk, then push a copy
1001 // of the current desicion point list onto the stack (this is
1002 // the same thing we do on an opening brace)
1003 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') {
1004 @SuppressWarnings("unchecked")
1005 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1006 decisionPointStack.push(clone);
1007 }
1008
1009 // create a new state, add it to the list of states to backfill
1010 // if we have looping states to worry about, set its "don't make
1011 // me an accepting state" flag if we've seen a slash, and add
1012 // it to the end of the state table
1013 int newState = tempStateTable.size();
1014 if (loopingStates.size() != 0) {
1015 statesToBackfill.addElement(Integer.valueOf(newState));
1016 }
1017 state = new short[numCategories + 1];
1018 if (sawEarlyBreak) {
1019 state[numCategories] = DONT_LOOP_FLAG;
1020 }
1021 tempStateTable.addElement(state);
1022
1023 // update everybody in the decision point list to point to
1024 // the new state (this also performs all the reconciliation
1025 // needed to make the table deterministic), then clear the
1026 // decision point list
1027 updateStateTable(decisionPointList, pendingChars, (short)newState);
1028 decisionPointList.removeAllElements();
1029
1030 // add all states created since the last literal character we've
1031 // seen to the decision point list
1032 lastState = currentState;
1033 do {
1034 ++currentState;
1035 decisionPointList.addElement(Integer.valueOf(currentState));
1036 } while (currentState + 1 < tempStateTable.size());
1037 }
1038 }
1039
1040 // a { marks the beginning of an optional run of characters. Push a
1041 // copy of the current decision point list onto the stack. This saves
1042 // it, preventing it from being affected by whatever's inside the parentheses.
1043 // This decision point list is restored when a } is encountered.
1044 else if (c == '{') {
1045 @SuppressWarnings("unchecked")
1046 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1047 decisionPointStack.push(clone);
1048 }
1049
1050 // a } marks the end of an optional run of characters. Pop the last decision
1051 // point list off the stack and merge it with the current decision point list.
1052 // a * denotes a repeating character or group (* after () is handled separately
1053 // below). In addition to restoring the decision point list, modify the
1054 // current state to point to itself on the appropriate character categories.
1055 else if (c == '}' || c == '*') {
1056 // when there's a *, update the current state to loop back on itself
1057 // on the character categories that caused us to enter this state
1058 if (c == '*') {
1059 for (int i = lastState + 1; i < tempStateTable.size(); i++) {
1060 Vector<Integer> temp = new Vector<>();
1061 temp.addElement(Integer.valueOf(i));
1062 updateStateTable(temp, pendingChars, (short)(lastState + 1));
1063 }
1064 }
1065
1066 // pop the top element off the decision point stack and merge
1067 // it with the current decision point list (this causes the divergent
1068 // paths through the state table to come together again on the next
1069 // new state)
1070 Vector<Integer> temp = decisionPointStack.pop();
1071 for (int i = 0; i < decisionPointList.size(); i++)
1072 temp.addElement(decisionPointList.elementAt(i));
1073 decisionPointList = temp;
1074 }
1075
1076 // a ? after a * modifies the behavior of * in cases where there is overlap
1077 // between the set of characters that repeat and the characters which follow.
1078 // Without the ?, all states following the repeating state, up to a state which
1079 // is reached by a character that doesn't overlap, will loop back into the
1080 // repeating state. With the ?, the mark states following the *? DON'T loop
1081 // back into the repeating state. Thus, "[a-z]*xyz" will match the longest
1104 // points before the ( (i.e., the places from which the () can be entered),
1105 // we need to keep track of the entry points in case the expression loops
1106 // (i.e., is followed by *). We do that by creating a dummy state in the
1107 // state table and adding it to the decision point list (BEFORE it's duplicated
1108 // on the stack). Nobody points to this state, so it'll get optimized out
1109 // at the end. It exists only to hold the entry points in case the ()
1110 // expression loops.
1111 else if (c == '(') {
1112
1113 // add a new state to the state table to hold the entry points into
1114 // the () expression
1115 tempStateTable.addElement(new short[numCategories + 1]);
1116
1117 // we have to adjust lastState and currentState to account for the
1118 // new dummy state
1119 lastState = currentState;
1120 ++currentState;
1121
1122 // add the current state to the decision point list (add it at the
1123 // BEGINNING so we can find it later)
1124 decisionPointList.insertElementAt(Integer.valueOf(currentState), 0);
1125
1126 // finally, push a copy of the current decision point list onto the
1127 // stack (this keeps track of the active decision point list before
1128 // the () expression), followed by an empty decision point list
1129 // (this will hold the exit points)
1130 @SuppressWarnings("unchecked")
1131 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1132 decisionPointStack.push(clone);
1133 decisionPointStack.push(new Vector<Integer>());
1134 }
1135
1136 // a | separates alternative character sequences in a () expression. When
1137 // a | is encountered, we add the current decision point list to the exit-point
1138 // list, and restore the decision point list to its state prior to the (.
1139 else if (c == '|') {
1140
1141 // pick out the top two decision point lists on the stack
1142 Vector<Integer> oneDown = decisionPointStack.pop();
1143 Vector<Integer> twoDown = decisionPointStack.peek();
1144 decisionPointStack.push(oneDown);
1191 // pop the original decision point list off the stack
1192 Vector<Integer> temp = decisionPointStack.pop();
1193
1194 // we squirreled away the row number of our entry point list
1195 // at the beginning of the original decision point list. Fish
1196 // that state number out and retrieve the entry point list
1197 int tempStateNum = temp.firstElement().intValue();
1198 short[] tempState = tempStateTable.elementAt(tempStateNum);
1199
1200 // merge the original decision point list with the current
1201 // decision point list
1202 for (int i = 0; i < decisionPointList.size(); i++)
1203 temp.addElement(decisionPointList.elementAt(i));
1204 decisionPointList = temp;
1205
1206 // finally, copy every forward reference from the entry point
1207 // list into every state in the new decision point list
1208 for (int i = 0; i < tempState.length; i++) {
1209 if (tempState[i] > tempStateNum) {
1210 updateStateTable(exitPoints,
1211 Character.valueOf((char)(i + 0x100)).toString(),
1212 tempState[i]);
1213 }
1214 }
1215
1216 // update lastState and currentState, and throw away the *
1217 lastState = currentState;
1218 currentState = tempStateTable.size() - 1;
1219 ++p;
1220 }
1221 }
1222
1223 // a / marks the position where the break is to go if the character sequence
1224 // matches this rule. We update the flag word of every state on the decision
1225 // point list to mark them as ending states, and take note of the fact that
1226 // we've seen the slash
1227 else if (c == '/') {
1228 sawEarlyBreak = true;
1229 for (int i = 0; i < decisionPointList.size(); i++) {
1230 state = tempStateTable.elementAt(decisionPointList.
1231 elementAt(i).intValue());
1313 * into the state in the state table (we'll call that oldValues). If there's a
1314 * collision (i.e., if the same cell has a nonzero value in both states, and it's
1315 * not the SAME value), then we have to reconcile the collision. We do this by
1316 * creating a new state, adding it to the end of the state table, and using this
1317 * function recursively to merge the original two states into a single, combined
1318 * state. This process may happen recursively (i.e., each successive level may
1319 * involve collisions). To prevent infinite recursion, we keep a log of merge
1320 * operations. Any time we're merging two states we've merged before, we can just
1321 * supply the row number for the result of that merge operation rather than creating
1322 * a new state just like it.
1323 * @param rowNum The row number in the state table of the state to be updated
1324 * @param newValues The state to merge it with.
1325 * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
1326 * (itself a copy of the decision point list from parseRule()). Newly-created
1327 * states get added to the decision point list if their "parents" were on it.
1328 */
1329 private void mergeStates(int rowNum,
1330 short[] newValues,
1331 Vector<Integer> rowsBeingUpdated) {
1332 short[] oldValues = tempStateTable.elementAt(rowNum);
1333 boolean isLoopingState = loopingStates.contains(Integer.valueOf(rowNum));
1334
1335 // for each of the cells in the rows we're reconciling, do...
1336 for (int i = 0; i < oldValues.length; i++) {
1337
1338 // if they contain the same value, we don't have to do anything
1339 if (oldValues[i] == newValues[i]) {
1340 continue;
1341 }
1342
1343 // if oldValues is a looping state and the state the current cell points to
1344 // is too, then we can just stomp over the current value of that cell (and
1345 // set the clear-looping-states flag if necessary)
1346 else if (isLoopingState && loopingStates.contains(Integer.valueOf(oldValues[i]))) {
1347 if (newValues[i] != 0) {
1348 if (oldValues[i] == 0) {
1349 clearLoopingStates = true;
1350 }
1351 oldValues[i] = newValues[i];
1352 }
1353 }
1354
1355 // if the current cell in oldValues is 0, copy in the corresponding value
1356 // from newValues
1357 else if (oldValues[i] == 0) {
1358 oldValues[i] = newValues[i];
1359 }
1360
1361 // the last column of each row is the flag column. Take care to merge the
1362 // flag words correctly
1363 else if (i == numCategories) {
1364 oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
1365 }
1366
1384
1385 // add this pair of row numbers to the merge list (create it first
1386 // if we haven't created the merge list yet)
1387 if (mergeList == null) {
1388 mergeList = new Vector<>();
1389 }
1390 mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
1391
1392 // create a new row to represent the merged state, and copy the
1393 // contents of oldRow into it, then add it to the end of the
1394 // state table and update the original row (oldValues) to point
1395 // to the new, merged, state
1396 short[] newRow = new short[numCategories + 1];
1397 short[] oldRow = tempStateTable.elementAt(oldRowNum);
1398 System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
1399 tempStateTable.addElement(newRow);
1400 oldValues[i] = (short)combinedRowNum;
1401
1402 // if the decision point list contains either of the parent rows,
1403 // update it to include the new row as well
1404 if ((decisionPointList.contains(Integer.valueOf(oldRowNum))
1405 || decisionPointList.contains(Integer.valueOf(newRowNum)))
1406 && !decisionPointList.contains(Integer.valueOf(combinedRowNum))
1407 ) {
1408 decisionPointList.addElement(Integer.valueOf(combinedRowNum));
1409 }
1410
1411 // do the same thing with the list of rows being updated
1412 if ((rowsBeingUpdated.contains(Integer.valueOf(oldRowNum))
1413 || rowsBeingUpdated.contains(Integer.valueOf(newRowNum)))
1414 && !rowsBeingUpdated.contains(Integer.valueOf(combinedRowNum))
1415 ) {
1416 decisionPointList.addElement(Integer.valueOf(combinedRowNum));
1417 }
1418 // now (groan) do the same thing for all the entries on the
1419 // decision point stack
1420 for (int k = 0; k < decisionPointStack.size(); k++) {
1421 Vector<Integer> dpl = decisionPointStack.elementAt(k);
1422 if ((dpl.contains(Integer.valueOf(oldRowNum))
1423 || dpl.contains(Integer.valueOf(newRowNum)))
1424 && !dpl.contains(Integer.valueOf(combinedRowNum))
1425 ) {
1426 dpl.addElement(Integer.valueOf(combinedRowNum));
1427 }
1428 }
1429
1430 // FINALLY (puff puff puff), call mergeStates() recursively to copy
1431 // the row referred to by newValues into the new row and resolve any
1432 // conflicts that come up at that level
1433 mergeStates(combinedRowNum, tempStateTable.elementAt(
1434 newValues[i]), rowsBeingUpdated);
1435 }
1436 }
1437 }
1438 return;
1439 }
1440
1441 /**
1442 * The merge list is a list of pairs of rows that have been merged somewhere in
1443 * the process of building this state table, along with the row number of the
1444 * row containing the merged state. This function looks up a pair of row numbers
1445 * and returns the row number of the row they combine into. (It returns 0 if
1446 * this pair of rows isn't in the merge list.)
1519 }
1520 statesToBackfill.removeAllElements();
1521 loopingStates.removeAllElements();
1522 }
1523
1524 if (newLoopingStates != null) {
1525 @SuppressWarnings("unchecked")
1526 Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone();
1527 loopingStates = clone;
1528 }
1529 }
1530
1531 /**
1532 * This removes "ending states" and states reachable from them from the
1533 * list of states to backfill.
1534 * @param The row number of the state to remove from the backfill list
1535 */
1536 private void eliminateBackfillStates(int baseState) {
1537
1538 // don't do anything unless this state is actually in the backfill list...
1539 if (statesToBackfill.contains(Integer.valueOf(baseState))) {
1540
1541 // if it is, take it out
1542 statesToBackfill.removeElement(Integer.valueOf(baseState));
1543
1544 // then go through and recursively call this function for every
1545 // state that the base state points to
1546 short[] state = tempStateTable.elementAt(baseState);
1547 for (int i = 0; i < numCategories; i++) {
1548 if (state[i] != 0) {
1549 eliminateBackfillStates(state[i]);
1550 }
1551 }
1552 }
1553 }
1554
1555 /**
1556 * This function completes the backfilling process by actually doing the
1557 * backfilling on the states that are marked for it
1558 */
1559 private void backfillLoopingStates() {
1560 short[] state;
1561 short[] loopingState = null;
1562 int loopingStateRowNum = 0;
1591 else if (state[j] == DONT_LOOP_FLAG) {
1592 state[j] = 0;
1593 }
1594 }
1595 }
1596 }
1597 }
1598
1599 /**
1600 * This function completes the state-table-building process by doing several
1601 * postprocessing steps and copying everything into its final resting place
1602 * in the iterator itself
1603 * @param forward True if we're working on the forward state table
1604 */
1605 private void finishBuildingStateTable(boolean forward) {
1606 // start by backfilling the looping states
1607 backfillLoopingStates();
1608
1609 int[] rowNumMap = new int[tempStateTable.size()];
1610 Stack<Integer> rowsToFollow = new Stack<>();
1611 rowsToFollow.push(Integer.valueOf(1));
1612 rowNumMap[1] = 1;
1613
1614 // determine which states are no longer reachable from the start state
1615 // (the reachable states will have their row numbers in the row number
1616 // map, and the nonreachable states will have zero in the row number map)
1617 while (rowsToFollow.size() != 0) {
1618 int rowNum = rowsToFollow.pop().intValue();
1619 short[] row = tempStateTable.elementAt(rowNum);
1620
1621 for (int i = 0; i < numCategories; i++) {
1622 if (row[i] != 0) {
1623 if (rowNumMap[row[i]] == 0) {
1624 rowNumMap[row[i]] = row[i];
1625 rowsToFollow.push(Integer.valueOf(row[i]));
1626 }
1627 }
1628 }
1629 }
1630
1631 boolean madeChange;
1632 int newRowNum;
1633
1634 // algorithm for minimizing the number of states in the table adapted from
1635 // Aho & Ullman, "Principles of Compiler Design"
1636 // The basic idea here is to organize the states into classes. When we're done,
1637 // all states in the same class can be considered identical and all but one eliminated.
1638
1639 // initially assign states to classes based on the number of populated cells they
1640 // contain (the class number is the number of populated cells)
1641 int[] stateClasses = new int[tempStateTable.size()];
1642 int nextClass = numCategories + 1;
1643 short[] state1, state2;
1644 for (int i = 1; i < stateClasses.length; i++) {
1645 if (rowNumMap[i] == 0) {
|