< prev index next >

make/jdk/src/classes/build/tools/generatebreakiteratordata/RuleBasedBreakIteratorBuilder.java

Print this page


   1 /*
   2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


 273                 // if the character is opening punctuation, verify that no nesting
 274                 // rules are broken, and push the character onto the stack
 275                 case '{':
 276                 case '<':
 277                 case '[':
 278                 case '(':
 279                     if (lastOpen == '<') {
 280                         error("Can't nest brackets inside <>", p, description);
 281                     }
 282                     if (lastOpen == '[' && c != '[') {
 283                         error("Can't nest anything in [] but []", p, description);
 284                     }
 285 
 286                     // if we see < anywhere except on the left-hand side of =,
 287                     // we must be seeing a variable name that was never defined
 288                     if (c == '<' && (haveEquals || havePipe)) {
 289                         error("Unknown variable name", p, description);
 290                     }
 291 
 292                     lastOpen = c;
 293                     parenStack.push(new Character((char)c));
 294                     if (c == '<') {
 295                         sawVarName = true;
 296                     }
 297                     break;
 298 
 299                 // if the character is closing punctuation, verify that it matches the
 300                 // last opening punctuation we saw, and that the brackets contain
 301                 // something, then pop the stack
 302                 case '}':
 303                 case '>':
 304                 case ']':
 305                 case ')':
 306                     char expectedClose = '\u0000';
 307                     switch (lastOpen) {
 308                         case '{':
 309                             expectedClose = '}';
 310                             break;
 311                         case '[':
 312                             expectedClose = ']';
 313                             break;


 885         //   that is, if the string ends in "aaaabc", the break will go before the first
 886         //   "a" rather than the last one.  Both of these are limitations in the design
 887         //   of RuleBasedBreakIterator and not limitations of the rule parser.
 888 
 889         int p = 0;
 890         int currentState = 1;   // don't use state number 0; 0 means "stop"
 891         int lastState = currentState;
 892         String pendingChars = "";
 893 
 894         decisionPointStack = new Stack<>();
 895         decisionPointList = new Vector<>();
 896         loopingStates = new Vector<>();
 897         statesToBackfill = new Vector<>();
 898 
 899         short[] state;
 900         boolean sawEarlyBreak = false;
 901 
 902         // if we're adding rules to the backward state table, mark the initial state
 903         // as a looping state
 904         if (!forward) {
 905             loopingStates.addElement(new Integer(1));
 906         }
 907 
 908         // put the current state on the decision point list before we start
 909         decisionPointList.addElement(new Integer(currentState)); // we want currentState to
 910                                                                  // be 1 here...
 911         currentState = tempStateTable.size() - 1;   // but after that, we want it to be
 912                                                     // 1 less than the state number of the next state
 913         while (p < rule.length()) {
 914             int c = rule.codePointAt(p);
 915             clearLoopingStates = false;
 916 
 917             // this section handles literal characters, escaped characters (which are
 918             // effectively literal characters too), the . token, and [] expressions
 919             if (c == '['
 920                 || c == '\\'
 921                 || Character.isLetter(c)
 922                 || Character.isDigit(c)
 923                 || c < ' '
 924                 || c == '.'
 925                 || c >= '\u007f') {
 926 
 927                 // if we're not on a period, isolate the expression and look up
 928                 // the corresponding category list
 929                 if (c != '.') {


 961                     else {
 962                         q = p + Character.charCount(c);
 963                     }
 964 
 965                     // look up the category list for the expression and store it
 966                     // in pendingChars
 967                     pendingChars = (String)expressions.get(rule.substring(p, q));
 968 
 969                     // advance the current position past the expression
 970                     p = q - Character.charCount(rule.codePointBefore(q));
 971                 }
 972 
 973                 // if the character we're on is a period, we end up down here
 974                 else {
 975                     int rowNum = decisionPointList.lastElement().intValue();
 976                     state = tempStateTable.elementAt(rowNum);
 977 
 978                     // if the period is followed by an asterisk, then just set the current
 979                     // state to loop back on itself
 980                     if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
 981                         decisionPointList.addElement(new Integer(state[0]));
 982                         pendingChars = "";
 983                         ++p;
 984                     }
 985 
 986                     // otherwise, fabricate a category list ("pendingChars") with
 987                     // every category in it
 988                     else {
 989                         StringBuffer temp = new StringBuffer();
 990                         for (int i = 0; i < numCategories; i++)
 991                             temp.append((char)(i + 0x100));
 992                         pendingChars = temp.toString();
 993                     }
 994                 }
 995 
 996                 // we'll end up in here for all expressions except for .*, which is
 997                 // special-cased above
 998                 if (pendingChars.length() != 0) {
 999 
1000                     // if the expression is followed by an asterisk, then push a copy
1001                     // of the current desicion point list onto the stack (this is
1002                     // the same thing we do on an opening brace)
1003                     if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') {
1004                         @SuppressWarnings("unchecked")
1005                         Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1006                         decisionPointStack.push(clone);
1007                     }
1008 
1009                     // create a new state, add it to the list of states to backfill
1010                     // if we have looping states to worry about, set its "don't make
1011                     // me an accepting state" flag if we've seen a slash, and add
1012                     // it to the end of the state table
1013                     int newState = tempStateTable.size();
1014                     if (loopingStates.size() != 0) {
1015                         statesToBackfill.addElement(new Integer(newState));
1016                     }
1017                     state = new short[numCategories + 1];
1018                     if (sawEarlyBreak) {
1019                         state[numCategories] = DONT_LOOP_FLAG;
1020                     }
1021                     tempStateTable.addElement(state);
1022 
1023                     // update everybody in the decision point list to point to
1024                     // the new state (this also performs all the reconciliation
1025                     // needed to make the table deterministic), then clear the
1026                     // decision point list
1027                     updateStateTable(decisionPointList, pendingChars, (short)newState);
1028                     decisionPointList.removeAllElements();
1029 
1030                     // add all states created since the last literal character we've
1031                     // seen to the decision point list
1032                     lastState = currentState;
1033                     do {
1034                         ++currentState;
1035                         decisionPointList.addElement(new Integer(currentState));
1036                     } while (currentState + 1 < tempStateTable.size());
1037                 }
1038             }
1039 
1040             // a { marks the beginning of an optional run of characters.  Push a
1041             // copy of the current decision point list onto the stack.  This saves
1042             // it, preventing it from being affected by whatever's inside the parentheses.
1043             // This decision point list is restored when a } is encountered.
1044             else if (c == '{') {
1045                 @SuppressWarnings("unchecked")
1046                 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1047                 decisionPointStack.push(clone);
1048             }
1049 
1050             // a } marks the end of an optional run of characters.  Pop the last decision
1051             // point list off the stack and merge it with the current decision point list.
1052             // a * denotes a repeating character or group (* after () is handled separately
1053             // below).  In addition to restoring the decision point list, modify the
1054             // current state to point to itself on the appropriate character categories.
1055             else if (c == '}' || c == '*') {
1056                 // when there's a *, update the current state to loop back on itself
1057                 // on the character categories that caused us to enter this state
1058                 if (c == '*') {
1059                     for (int i = lastState + 1; i < tempStateTable.size(); i++) {
1060                         Vector<Integer> temp = new Vector<>();
1061                         temp.addElement(new Integer(i));
1062                         updateStateTable(temp, pendingChars, (short)(lastState + 1));
1063                     }
1064                 }
1065 
1066                 // pop the top element off the decision point stack and merge
1067                 // it with the current decision point list (this causes the divergent
1068                 // paths through the state table to come together again on the next
1069                 // new state)
1070                 Vector<Integer> temp = decisionPointStack.pop();
1071                 for (int i = 0; i < decisionPointList.size(); i++)
1072                     temp.addElement(decisionPointList.elementAt(i));
1073                 decisionPointList = temp;
1074             }
1075 
1076             // a ? after a * modifies the behavior of * in cases where there is overlap
1077             // between the set of characters that repeat and the characters which follow.
1078             // Without the ?, all states following the repeating state, up to a state which
1079             // is reached by a character that doesn't overlap, will loop back into the
1080             // repeating state.  With the ?, the mark states following the *? DON'T loop
1081             // back into the repeating state.  Thus, "[a-z]*xyz" will match the longest


1104             // points before the ( (i.e., the places from which the () can be entered),
1105             // we need to keep track of the entry points in case the expression loops
1106             // (i.e., is followed by *).  We do that by creating a dummy state in the
1107             // state table and adding it to the decision point list (BEFORE it's duplicated
1108             // on the stack).  Nobody points to this state, so it'll get optimized out
1109             // at the end.  It exists only to hold the entry points in case the ()
1110             // expression loops.
1111             else if (c == '(') {
1112 
1113                 // add a new state to the state table to hold the entry points into
1114                 // the () expression
1115                 tempStateTable.addElement(new short[numCategories + 1]);
1116 
1117                 // we have to adjust lastState and currentState to account for the
1118                 // new dummy state
1119                 lastState = currentState;
1120                 ++currentState;
1121 
1122                 // add the current state to the decision point list (add it at the
1123                 // BEGINNING so we can find it later)
1124                 decisionPointList.insertElementAt(new Integer(currentState), 0);
1125 
1126                 // finally, push a copy of the current decision point list onto the
1127                 // stack (this keeps track of the active decision point list before
1128                 // the () expression), followed by an empty decision point list
1129                 // (this will hold the exit points)
1130                 @SuppressWarnings("unchecked")
1131                 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1132                 decisionPointStack.push(clone);
1133                 decisionPointStack.push(new Vector<Integer>());
1134             }
1135 
1136             // a | separates alternative character sequences in a () expression.  When
1137             // a | is encountered, we add the current decision point list to the exit-point
1138             // list, and restore the decision point list to its state prior to the (.
1139             else if (c == '|') {
1140 
1141                 // pick out the top two decision point lists on the stack
1142                 Vector<Integer> oneDown = decisionPointStack.pop();
1143                 Vector<Integer> twoDown = decisionPointStack.peek();
1144                 decisionPointStack.push(oneDown);


1191                     // pop the original decision point list off the stack
1192                     Vector<Integer> temp = decisionPointStack.pop();
1193 
1194                     // we squirreled away the row number of our entry point list
1195                     // at the beginning of the original decision point list.  Fish
1196                     // that state number out and retrieve the entry point list
1197                     int tempStateNum = temp.firstElement().intValue();
1198                     short[] tempState = tempStateTable.elementAt(tempStateNum);
1199 
1200                     // merge the original decision point list with the current
1201                     // decision point list
1202                     for (int i = 0; i < decisionPointList.size(); i++)
1203                         temp.addElement(decisionPointList.elementAt(i));
1204                     decisionPointList = temp;
1205 
1206                     // finally, copy every forward reference from the entry point
1207                     // list into every state in the new decision point list
1208                     for (int i = 0; i < tempState.length; i++) {
1209                         if (tempState[i] > tempStateNum) {
1210                             updateStateTable(exitPoints,
1211                                              new Character((char)(i + 0x100)).toString(),
1212                                              tempState[i]);
1213                         }
1214                     }
1215 
1216                     // update lastState and currentState, and throw away the *
1217                     lastState = currentState;
1218                     currentState = tempStateTable.size() - 1;
1219                     ++p;
1220                 }
1221             }
1222 
1223             // a / marks the position where the break is to go if the character sequence
1224             // matches this rule.  We update the flag word of every state on the decision
1225             // point list to mark them as ending states, and take note of the fact that
1226             // we've seen the slash
1227             else if (c == '/') {
1228                 sawEarlyBreak = true;
1229                 for (int i = 0; i < decisionPointList.size(); i++) {
1230                     state = tempStateTable.elementAt(decisionPointList.
1231                                     elementAt(i).intValue());


1313      * into the state in the state table (we'll call that oldValues).  If there's a
1314      * collision (i.e., if the same cell has a nonzero value in both states, and it's
1315      * not the SAME value), then we have to reconcile the collision.  We do this by
1316      * creating a new state, adding it to the end of the state table, and using this
1317      * function recursively to merge the original two states into a single, combined
1318      * state.  This process may happen recursively (i.e., each successive level may
1319      * involve collisions).  To prevent infinite recursion, we keep a log of merge
1320      * operations.  Any time we're merging two states we've merged before, we can just
1321      * supply the row number for the result of that merge operation rather than creating
1322      * a new state just like it.
1323      * @param rowNum The row number in the state table of the state to be updated
1324      * @param newValues The state to merge it with.
1325      * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
1326      * (itself a copy of the decision point list from parseRule()).  Newly-created
1327      * states get added to the decision point list if their "parents" were on it.
1328      */
1329     private void mergeStates(int rowNum,
1330                              short[] newValues,
1331                              Vector<Integer> rowsBeingUpdated) {
1332         short[] oldValues = tempStateTable.elementAt(rowNum);
1333         boolean isLoopingState = loopingStates.contains(new Integer(rowNum));
1334 
1335         // for each of the cells in the rows we're reconciling, do...
1336         for (int i = 0; i < oldValues.length; i++) {
1337 
1338             // if they contain the same value, we don't have to do anything
1339             if (oldValues[i] == newValues[i]) {
1340                 continue;
1341             }
1342 
1343             // if oldValues is a looping state and the state the current cell points to
1344             // is too, then we can just stomp over the current value of that cell (and
1345             // set the clear-looping-states flag if necessary)
1346             else if (isLoopingState && loopingStates.contains(new Integer(oldValues[i]))) {
1347                 if (newValues[i] != 0) {
1348                     if (oldValues[i] == 0) {
1349                         clearLoopingStates = true;
1350                     }
1351                     oldValues[i] = newValues[i];
1352                 }
1353             }
1354 
1355             // if the current cell in oldValues is 0, copy in the corresponding value
1356             // from newValues
1357             else if (oldValues[i] == 0) {
1358                 oldValues[i] = newValues[i];
1359             }
1360 
1361             // the last column of each row is the flag column.  Take care to merge the
1362             // flag words correctly
1363             else if (i == numCategories) {
1364                 oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
1365             }
1366 


1384 
1385                     // add this pair of row numbers to the merge list (create it first
1386                     // if we haven't created the merge list yet)
1387                     if (mergeList == null) {
1388                         mergeList = new Vector<>();
1389                     }
1390                     mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
1391 
1392                     // create a new row to represent the merged state, and copy the
1393                     // contents of oldRow into it, then add it to the end of the
1394                     // state table and update the original row (oldValues) to point
1395                     // to the new, merged, state
1396                     short[] newRow = new short[numCategories + 1];
1397                     short[] oldRow = tempStateTable.elementAt(oldRowNum);
1398                     System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
1399                     tempStateTable.addElement(newRow);
1400                     oldValues[i] = (short)combinedRowNum;
1401 
1402                     // if the decision point list contains either of the parent rows,
1403                     // update it to include the new row as well
1404                     if ((decisionPointList.contains(new Integer(oldRowNum))
1405                             || decisionPointList.contains(new Integer(newRowNum)))
1406                         && !decisionPointList.contains(new Integer(combinedRowNum))
1407                     ) {
1408                         decisionPointList.addElement(new Integer(combinedRowNum));
1409                     }
1410 
1411                     // do the same thing with the list of rows being updated
1412                     if ((rowsBeingUpdated.contains(new Integer(oldRowNum))
1413                             || rowsBeingUpdated.contains(new Integer(newRowNum)))
1414                         && !rowsBeingUpdated.contains(new Integer(combinedRowNum))
1415                     ) {
1416                         decisionPointList.addElement(new Integer(combinedRowNum));
1417                     }
1418                     // now (groan) do the same thing for all the entries on the
1419                     // decision point stack
1420                     for (int k = 0; k < decisionPointStack.size(); k++) {
1421                         Vector<Integer> dpl = decisionPointStack.elementAt(k);
1422                         if ((dpl.contains(new Integer(oldRowNum))
1423                                 || dpl.contains(new Integer(newRowNum)))
1424                             && !dpl.contains(new Integer(combinedRowNum))
1425                         ) {
1426                             dpl.addElement(new Integer(combinedRowNum));
1427                         }
1428                     }
1429 
1430                     // FINALLY (puff puff puff), call mergeStates() recursively to copy
1431                     // the row referred to by newValues into the new row and resolve any
1432                     // conflicts that come up at that level
1433                     mergeStates(combinedRowNum, tempStateTable.elementAt(
1434                                     newValues[i]), rowsBeingUpdated);
1435                 }
1436             }
1437         }
1438         return;
1439     }
1440 
1441     /**
1442      * The merge list is a list of pairs of rows that have been merged somewhere in
1443      * the process of building this state table, along with the row number of the
1444      * row containing the merged state.  This function looks up a pair of row numbers
1445      * and returns the row number of the row they combine into.  (It returns 0 if
1446      * this pair of rows isn't in the merge list.)


1519             }
1520             statesToBackfill.removeAllElements();
1521             loopingStates.removeAllElements();
1522         }
1523 
1524         if (newLoopingStates != null) {
1525             @SuppressWarnings("unchecked")
1526             Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone();
1527             loopingStates = clone;
1528         }
1529     }
1530 
1531     /**
1532      * This removes "ending states" and states reachable from them from the
1533      * list of states to backfill.
1534      * @param The row number of the state to remove from the backfill list
1535      */
1536     private void eliminateBackfillStates(int baseState) {
1537 
1538         // don't do anything unless this state is actually in the backfill list...
1539         if (statesToBackfill.contains(new Integer(baseState))) {
1540 
1541             // if it is, take it out
1542             statesToBackfill.removeElement(new Integer(baseState));
1543 
1544             // then go through and recursively call this function for every
1545             // state that the base state points to
1546             short[] state = tempStateTable.elementAt(baseState);
1547             for (int i = 0; i < numCategories; i++) {
1548                 if (state[i] != 0) {
1549                     eliminateBackfillStates(state[i]);
1550                 }
1551             }
1552         }
1553     }
1554 
1555     /**
1556      * This function completes the backfilling process by actually doing the
1557      * backfilling on the states that are marked for it
1558      */
1559     private void backfillLoopingStates() {
1560         short[] state;
1561         short[] loopingState = null;
1562         int loopingStateRowNum = 0;


1591                     else if (state[j] == DONT_LOOP_FLAG) {
1592                         state[j] = 0;
1593                     }
1594                 }
1595             }
1596         }
1597     }
1598 
1599     /**
1600      * This function completes the state-table-building process by doing several
1601      * postprocessing steps and copying everything into its final resting place
1602      * in the iterator itself
1603      * @param forward True if we're working on the forward state table
1604      */
1605     private void finishBuildingStateTable(boolean forward) {
1606         // start by backfilling the looping states
1607         backfillLoopingStates();
1608 
1609         int[] rowNumMap = new int[tempStateTable.size()];
1610         Stack<Integer> rowsToFollow = new Stack<>();
1611         rowsToFollow.push(new Integer(1));
1612         rowNumMap[1] = 1;
1613 
1614         // determine which states are no longer reachable from the start state
1615         // (the reachable states will have their row numbers in the row number
1616         // map, and the nonreachable states will have zero in the row number map)
1617         while (rowsToFollow.size() != 0) {
1618             int rowNum = rowsToFollow.pop().intValue();
1619             short[] row = tempStateTable.elementAt(rowNum);
1620 
1621             for (int i = 0; i < numCategories; i++) {
1622                 if (row[i] != 0) {
1623                     if (rowNumMap[row[i]] == 0) {
1624                         rowNumMap[row[i]] = row[i];
1625                         rowsToFollow.push(new Integer(row[i]));
1626                     }
1627                 }
1628             }
1629         }
1630 
1631         boolean madeChange;
1632         int newRowNum;
1633 
1634         // algorithm for minimizing the number of states in the table adapted from
1635         // Aho & Ullman, "Principles of Compiler Design"
1636         // The basic idea here is to organize the states into classes.  When we're done,
1637         // all states in the same class can be considered identical and all but one eliminated.
1638 
1639         // initially assign states to classes based on the number of populated cells they
1640         // contain (the class number is the number of populated cells)
1641         int[] stateClasses = new int[tempStateTable.size()];
1642         int nextClass = numCategories + 1;
1643         short[] state1, state2;
1644         for (int i = 1; i < stateClasses.length; i++) {
1645             if (rowNumMap[i] == 0) {


   1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


 273                 // if the character is opening punctuation, verify that no nesting
 274                 // rules are broken, and push the character onto the stack
 275                 case '{':
 276                 case '<':
 277                 case '[':
 278                 case '(':
 279                     if (lastOpen == '<') {
 280                         error("Can't nest brackets inside <>", p, description);
 281                     }
 282                     if (lastOpen == '[' && c != '[') {
 283                         error("Can't nest anything in [] but []", p, description);
 284                     }
 285 
 286                     // if we see < anywhere except on the left-hand side of =,
 287                     // we must be seeing a variable name that was never defined
 288                     if (c == '<' && (haveEquals || havePipe)) {
 289                         error("Unknown variable name", p, description);
 290                     }
 291 
 292                     lastOpen = c;
 293                     parenStack.push(Character.valueOf((char)c));
 294                     if (c == '<') {
 295                         sawVarName = true;
 296                     }
 297                     break;
 298 
 299                 // if the character is closing punctuation, verify that it matches the
 300                 // last opening punctuation we saw, and that the brackets contain
 301                 // something, then pop the stack
 302                 case '}':
 303                 case '>':
 304                 case ']':
 305                 case ')':
 306                     char expectedClose = '\u0000';
 307                     switch (lastOpen) {
 308                         case '{':
 309                             expectedClose = '}';
 310                             break;
 311                         case '[':
 312                             expectedClose = ']';
 313                             break;


 885         //   that is, if the string ends in "aaaabc", the break will go before the first
 886         //   "a" rather than the last one.  Both of these are limitations in the design
 887         //   of RuleBasedBreakIterator and not limitations of the rule parser.
 888 
 889         int p = 0;
 890         int currentState = 1;   // don't use state number 0; 0 means "stop"
 891         int lastState = currentState;
 892         String pendingChars = "";
 893 
 894         decisionPointStack = new Stack<>();
 895         decisionPointList = new Vector<>();
 896         loopingStates = new Vector<>();
 897         statesToBackfill = new Vector<>();
 898 
 899         short[] state;
 900         boolean sawEarlyBreak = false;
 901 
 902         // if we're adding rules to the backward state table, mark the initial state
 903         // as a looping state
 904         if (!forward) {
 905             loopingStates.addElement(Integer.valueOf(1));
 906         }
 907 
 908         // put the current state on the decision point list before we start
 909         decisionPointList.addElement(Integer.valueOf(currentState)); // we want currentState to
 910                                                                  // be 1 here...
 911         currentState = tempStateTable.size() - 1;   // but after that, we want it to be
 912                                                     // 1 less than the state number of the next state
 913         while (p < rule.length()) {
 914             int c = rule.codePointAt(p);
 915             clearLoopingStates = false;
 916 
 917             // this section handles literal characters, escaped characters (which are
 918             // effectively literal characters too), the . token, and [] expressions
 919             if (c == '['
 920                 || c == '\\'
 921                 || Character.isLetter(c)
 922                 || Character.isDigit(c)
 923                 || c < ' '
 924                 || c == '.'
 925                 || c >= '\u007f') {
 926 
 927                 // if we're not on a period, isolate the expression and look up
 928                 // the corresponding category list
 929                 if (c != '.') {


 961                     else {
 962                         q = p + Character.charCount(c);
 963                     }
 964 
 965                     // look up the category list for the expression and store it
 966                     // in pendingChars
 967                     pendingChars = (String)expressions.get(rule.substring(p, q));
 968 
 969                     // advance the current position past the expression
 970                     p = q - Character.charCount(rule.codePointBefore(q));
 971                 }
 972 
 973                 // if the character we're on is a period, we end up down here
 974                 else {
 975                     int rowNum = decisionPointList.lastElement().intValue();
 976                     state = tempStateTable.elementAt(rowNum);
 977 
 978                     // if the period is followed by an asterisk, then just set the current
 979                     // state to loop back on itself
 980                     if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
 981                         decisionPointList.addElement(Integer.valueOf(state[0]));
 982                         pendingChars = "";
 983                         ++p;
 984                     }
 985 
 986                     // otherwise, fabricate a category list ("pendingChars") with
 987                     // every category in it
 988                     else {
 989                         StringBuffer temp = new StringBuffer();
 990                         for (int i = 0; i < numCategories; i++)
 991                             temp.append((char)(i + 0x100));
 992                         pendingChars = temp.toString();
 993                     }
 994                 }
 995 
 996                 // we'll end up in here for all expressions except for .*, which is
 997                 // special-cased above
 998                 if (pendingChars.length() != 0) {
 999 
1000                     // if the expression is followed by an asterisk, then push a copy
1001                     // of the current desicion point list onto the stack (this is
1002                     // the same thing we do on an opening brace)
1003                     if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') {
1004                         @SuppressWarnings("unchecked")
1005                         Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1006                         decisionPointStack.push(clone);
1007                     }
1008 
1009                     // create a new state, add it to the list of states to backfill
1010                     // if we have looping states to worry about, set its "don't make
1011                     // me an accepting state" flag if we've seen a slash, and add
1012                     // it to the end of the state table
1013                     int newState = tempStateTable.size();
1014                     if (loopingStates.size() != 0) {
1015                         statesToBackfill.addElement(Integer.valueOf(newState));
1016                     }
1017                     state = new short[numCategories + 1];
1018                     if (sawEarlyBreak) {
1019                         state[numCategories] = DONT_LOOP_FLAG;
1020                     }
1021                     tempStateTable.addElement(state);
1022 
1023                     // update everybody in the decision point list to point to
1024                     // the new state (this also performs all the reconciliation
1025                     // needed to make the table deterministic), then clear the
1026                     // decision point list
1027                     updateStateTable(decisionPointList, pendingChars, (short)newState);
1028                     decisionPointList.removeAllElements();
1029 
1030                     // add all states created since the last literal character we've
1031                     // seen to the decision point list
1032                     lastState = currentState;
1033                     do {
1034                         ++currentState;
1035                         decisionPointList.addElement(Integer.valueOf(currentState));
1036                     } while (currentState + 1 < tempStateTable.size());
1037                 }
1038             }
1039 
1040             // a { marks the beginning of an optional run of characters.  Push a
1041             // copy of the current decision point list onto the stack.  This saves
1042             // it, preventing it from being affected by whatever's inside the parentheses.
1043             // This decision point list is restored when a } is encountered.
1044             else if (c == '{') {
1045                 @SuppressWarnings("unchecked")
1046                 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1047                 decisionPointStack.push(clone);
1048             }
1049 
1050             // a } marks the end of an optional run of characters.  Pop the last decision
1051             // point list off the stack and merge it with the current decision point list.
1052             // a * denotes a repeating character or group (* after () is handled separately
1053             // below).  In addition to restoring the decision point list, modify the
1054             // current state to point to itself on the appropriate character categories.
1055             else if (c == '}' || c == '*') {
1056                 // when there's a *, update the current state to loop back on itself
1057                 // on the character categories that caused us to enter this state
1058                 if (c == '*') {
1059                     for (int i = lastState + 1; i < tempStateTable.size(); i++) {
1060                         Vector<Integer> temp = new Vector<>();
1061                         temp.addElement(Integer.valueOf(i));
1062                         updateStateTable(temp, pendingChars, (short)(lastState + 1));
1063                     }
1064                 }
1065 
1066                 // pop the top element off the decision point stack and merge
1067                 // it with the current decision point list (this causes the divergent
1068                 // paths through the state table to come together again on the next
1069                 // new state)
1070                 Vector<Integer> temp = decisionPointStack.pop();
1071                 for (int i = 0; i < decisionPointList.size(); i++)
1072                     temp.addElement(decisionPointList.elementAt(i));
1073                 decisionPointList = temp;
1074             }
1075 
1076             // a ? after a * modifies the behavior of * in cases where there is overlap
1077             // between the set of characters that repeat and the characters which follow.
1078             // Without the ?, all states following the repeating state, up to a state which
1079             // is reached by a character that doesn't overlap, will loop back into the
1080             // repeating state.  With the ?, the mark states following the *? DON'T loop
1081             // back into the repeating state.  Thus, "[a-z]*xyz" will match the longest


1104             // points before the ( (i.e., the places from which the () can be entered),
1105             // we need to keep track of the entry points in case the expression loops
1106             // (i.e., is followed by *).  We do that by creating a dummy state in the
1107             // state table and adding it to the decision point list (BEFORE it's duplicated
1108             // on the stack).  Nobody points to this state, so it'll get optimized out
1109             // at the end.  It exists only to hold the entry points in case the ()
1110             // expression loops.
1111             else if (c == '(') {
1112 
1113                 // add a new state to the state table to hold the entry points into
1114                 // the () expression
1115                 tempStateTable.addElement(new short[numCategories + 1]);
1116 
1117                 // we have to adjust lastState and currentState to account for the
1118                 // new dummy state
1119                 lastState = currentState;
1120                 ++currentState;
1121 
1122                 // add the current state to the decision point list (add it at the
1123                 // BEGINNING so we can find it later)
1124                 decisionPointList.insertElementAt(Integer.valueOf(currentState), 0);
1125 
1126                 // finally, push a copy of the current decision point list onto the
1127                 // stack (this keeps track of the active decision point list before
1128                 // the () expression), followed by an empty decision point list
1129                 // (this will hold the exit points)
1130                 @SuppressWarnings("unchecked")
1131                 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone();
1132                 decisionPointStack.push(clone);
1133                 decisionPointStack.push(new Vector<Integer>());
1134             }
1135 
1136             // a | separates alternative character sequences in a () expression.  When
1137             // a | is encountered, we add the current decision point list to the exit-point
1138             // list, and restore the decision point list to its state prior to the (.
1139             else if (c == '|') {
1140 
1141                 // pick out the top two decision point lists on the stack
1142                 Vector<Integer> oneDown = decisionPointStack.pop();
1143                 Vector<Integer> twoDown = decisionPointStack.peek();
1144                 decisionPointStack.push(oneDown);


1191                     // pop the original decision point list off the stack
1192                     Vector<Integer> temp = decisionPointStack.pop();
1193 
1194                     // we squirreled away the row number of our entry point list
1195                     // at the beginning of the original decision point list.  Fish
1196                     // that state number out and retrieve the entry point list
1197                     int tempStateNum = temp.firstElement().intValue();
1198                     short[] tempState = tempStateTable.elementAt(tempStateNum);
1199 
1200                     // merge the original decision point list with the current
1201                     // decision point list
1202                     for (int i = 0; i < decisionPointList.size(); i++)
1203                         temp.addElement(decisionPointList.elementAt(i));
1204                     decisionPointList = temp;
1205 
1206                     // finally, copy every forward reference from the entry point
1207                     // list into every state in the new decision point list
1208                     for (int i = 0; i < tempState.length; i++) {
1209                         if (tempState[i] > tempStateNum) {
1210                             updateStateTable(exitPoints,
1211                                              Character.valueOf((char)(i + 0x100)).toString(),
1212                                              tempState[i]);
1213                         }
1214                     }
1215 
1216                     // update lastState and currentState, and throw away the *
1217                     lastState = currentState;
1218                     currentState = tempStateTable.size() - 1;
1219                     ++p;
1220                 }
1221             }
1222 
1223             // a / marks the position where the break is to go if the character sequence
1224             // matches this rule.  We update the flag word of every state on the decision
1225             // point list to mark them as ending states, and take note of the fact that
1226             // we've seen the slash
1227             else if (c == '/') {
1228                 sawEarlyBreak = true;
1229                 for (int i = 0; i < decisionPointList.size(); i++) {
1230                     state = tempStateTable.elementAt(decisionPointList.
1231                                     elementAt(i).intValue());


1313      * into the state in the state table (we'll call that oldValues).  If there's a
1314      * collision (i.e., if the same cell has a nonzero value in both states, and it's
1315      * not the SAME value), then we have to reconcile the collision.  We do this by
1316      * creating a new state, adding it to the end of the state table, and using this
1317      * function recursively to merge the original two states into a single, combined
1318      * state.  This process may happen recursively (i.e., each successive level may
1319      * involve collisions).  To prevent infinite recursion, we keep a log of merge
1320      * operations.  Any time we're merging two states we've merged before, we can just
1321      * supply the row number for the result of that merge operation rather than creating
1322      * a new state just like it.
1323      * @param rowNum The row number in the state table of the state to be updated
1324      * @param newValues The state to merge it with.
1325      * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
1326      * (itself a copy of the decision point list from parseRule()).  Newly-created
1327      * states get added to the decision point list if their "parents" were on it.
1328      */
1329     private void mergeStates(int rowNum,
1330                              short[] newValues,
1331                              Vector<Integer> rowsBeingUpdated) {
1332         short[] oldValues = tempStateTable.elementAt(rowNum);
1333         boolean isLoopingState = loopingStates.contains(Integer.valueOf(rowNum));
1334 
1335         // for each of the cells in the rows we're reconciling, do...
1336         for (int i = 0; i < oldValues.length; i++) {
1337 
1338             // if they contain the same value, we don't have to do anything
1339             if (oldValues[i] == newValues[i]) {
1340                 continue;
1341             }
1342 
1343             // if oldValues is a looping state and the state the current cell points to
1344             // is too, then we can just stomp over the current value of that cell (and
1345             // set the clear-looping-states flag if necessary)
1346             else if (isLoopingState && loopingStates.contains(Integer.valueOf(oldValues[i]))) {
1347                 if (newValues[i] != 0) {
1348                     if (oldValues[i] == 0) {
1349                         clearLoopingStates = true;
1350                     }
1351                     oldValues[i] = newValues[i];
1352                 }
1353             }
1354 
1355             // if the current cell in oldValues is 0, copy in the corresponding value
1356             // from newValues
1357             else if (oldValues[i] == 0) {
1358                 oldValues[i] = newValues[i];
1359             }
1360 
1361             // the last column of each row is the flag column.  Take care to merge the
1362             // flag words correctly
1363             else if (i == numCategories) {
1364                 oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
1365             }
1366 


1384 
1385                     // add this pair of row numbers to the merge list (create it first
1386                     // if we haven't created the merge list yet)
1387                     if (mergeList == null) {
1388                         mergeList = new Vector<>();
1389                     }
1390                     mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
1391 
1392                     // create a new row to represent the merged state, and copy the
1393                     // contents of oldRow into it, then add it to the end of the
1394                     // state table and update the original row (oldValues) to point
1395                     // to the new, merged, state
1396                     short[] newRow = new short[numCategories + 1];
1397                     short[] oldRow = tempStateTable.elementAt(oldRowNum);
1398                     System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
1399                     tempStateTable.addElement(newRow);
1400                     oldValues[i] = (short)combinedRowNum;
1401 
1402                     // if the decision point list contains either of the parent rows,
1403                     // update it to include the new row as well
1404                     if ((decisionPointList.contains(Integer.valueOf(oldRowNum))
1405                             || decisionPointList.contains(Integer.valueOf(newRowNum)))
1406                         && !decisionPointList.contains(Integer.valueOf(combinedRowNum))
1407                     ) {
1408                         decisionPointList.addElement(Integer.valueOf(combinedRowNum));
1409                     }
1410 
1411                     // do the same thing with the list of rows being updated
1412                     if ((rowsBeingUpdated.contains(Integer.valueOf(oldRowNum))
1413                             || rowsBeingUpdated.contains(Integer.valueOf(newRowNum)))
1414                         && !rowsBeingUpdated.contains(Integer.valueOf(combinedRowNum))
1415                     ) {
1416                         decisionPointList.addElement(Integer.valueOf(combinedRowNum));
1417                     }
1418                     // now (groan) do the same thing for all the entries on the
1419                     // decision point stack
1420                     for (int k = 0; k < decisionPointStack.size(); k++) {
1421                         Vector<Integer> dpl = decisionPointStack.elementAt(k);
1422                         if ((dpl.contains(Integer.valueOf(oldRowNum))
1423                                 || dpl.contains(Integer.valueOf(newRowNum)))
1424                             && !dpl.contains(Integer.valueOf(combinedRowNum))
1425                         ) {
1426                             dpl.addElement(Integer.valueOf(combinedRowNum));
1427                         }
1428                     }
1429 
1430                     // FINALLY (puff puff puff), call mergeStates() recursively to copy
1431                     // the row referred to by newValues into the new row and resolve any
1432                     // conflicts that come up at that level
1433                     mergeStates(combinedRowNum, tempStateTable.elementAt(
1434                                     newValues[i]), rowsBeingUpdated);
1435                 }
1436             }
1437         }
1438         return;
1439     }
1440 
1441     /**
1442      * The merge list is a list of pairs of rows that have been merged somewhere in
1443      * the process of building this state table, along with the row number of the
1444      * row containing the merged state.  This function looks up a pair of row numbers
1445      * and returns the row number of the row they combine into.  (It returns 0 if
1446      * this pair of rows isn't in the merge list.)


1519             }
1520             statesToBackfill.removeAllElements();
1521             loopingStates.removeAllElements();
1522         }
1523 
1524         if (newLoopingStates != null) {
1525             @SuppressWarnings("unchecked")
1526             Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone();
1527             loopingStates = clone;
1528         }
1529     }
1530 
1531     /**
1532      * This removes "ending states" and states reachable from them from the
1533      * list of states to backfill.
1534      * @param The row number of the state to remove from the backfill list
1535      */
1536     private void eliminateBackfillStates(int baseState) {
1537 
1538         // don't do anything unless this state is actually in the backfill list...
1539         if (statesToBackfill.contains(Integer.valueOf(baseState))) {
1540 
1541             // if it is, take it out
1542             statesToBackfill.removeElement(Integer.valueOf(baseState));
1543 
1544             // then go through and recursively call this function for every
1545             // state that the base state points to
1546             short[] state = tempStateTable.elementAt(baseState);
1547             for (int i = 0; i < numCategories; i++) {
1548                 if (state[i] != 0) {
1549                     eliminateBackfillStates(state[i]);
1550                 }
1551             }
1552         }
1553     }
1554 
1555     /**
1556      * This function completes the backfilling process by actually doing the
1557      * backfilling on the states that are marked for it
1558      */
1559     private void backfillLoopingStates() {
1560         short[] state;
1561         short[] loopingState = null;
1562         int loopingStateRowNum = 0;


1591                     else if (state[j] == DONT_LOOP_FLAG) {
1592                         state[j] = 0;
1593                     }
1594                 }
1595             }
1596         }
1597     }
1598 
1599     /**
1600      * This function completes the state-table-building process by doing several
1601      * postprocessing steps and copying everything into its final resting place
1602      * in the iterator itself
1603      * @param forward True if we're working on the forward state table
1604      */
1605     private void finishBuildingStateTable(boolean forward) {
1606         // start by backfilling the looping states
1607         backfillLoopingStates();
1608 
1609         int[] rowNumMap = new int[tempStateTable.size()];
1610         Stack<Integer> rowsToFollow = new Stack<>();
1611         rowsToFollow.push(Integer.valueOf(1));
1612         rowNumMap[1] = 1;
1613 
1614         // determine which states are no longer reachable from the start state
1615         // (the reachable states will have their row numbers in the row number
1616         // map, and the nonreachable states will have zero in the row number map)
1617         while (rowsToFollow.size() != 0) {
1618             int rowNum = rowsToFollow.pop().intValue();
1619             short[] row = tempStateTable.elementAt(rowNum);
1620 
1621             for (int i = 0; i < numCategories; i++) {
1622                 if (row[i] != 0) {
1623                     if (rowNumMap[row[i]] == 0) {
1624                         rowNumMap[row[i]] = row[i];
1625                         rowsToFollow.push(Integer.valueOf(row[i]));
1626                     }
1627                 }
1628             }
1629         }
1630 
1631         boolean madeChange;
1632         int newRowNum;
1633 
1634         // algorithm for minimizing the number of states in the table adapted from
1635         // Aho & Ullman, "Principles of Compiler Design"
1636         // The basic idea here is to organize the states into classes.  When we're done,
1637         // all states in the same class can be considered identical and all but one eliminated.
1638 
1639         // initially assign states to classes based on the number of populated cells they
1640         // contain (the class number is the number of populated cells)
1641         int[] stateClasses = new int[tempStateTable.size()];
1642         int nextClass = numCategories + 1;
1643         short[] state1, state2;
1644         for (int i = 1; i < stateClasses.length; i++) {
1645             if (rowNumMap[i] == 0) {


< prev index next >