1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Licensed to the Apache Software Foundation (ASF) under one or more
   7  * contributor license agreements.  See the NOTICE file distributed with
   8  * this work for additional information regarding copyright ownership.
   9  * The ASF licenses this file to You under the Apache License, Version 2.0
  10  * (the "License"); you may not use this file except in compliance with
  11  * the License.  You may obtain a copy of the License at
  12  *
  13  *      http://www.apache.org/licenses/LICENSE-2.0
  14  *
  15  * Unless required by applicable law or agreed to in writing, software
  16  * distributed under the License is distributed on an "AS IS" BASIS,
  17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  18  * See the License for the specific language governing permissions and
  19  * limitations under the License.
  20  */
  21 
  22 package com.sun.org.apache.xerces.internal.impl.dtd.models;
  23 
  24 import java.util.HashMap;
  25 
  26 import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec;
  27 import com.sun.org.apache.xerces.internal.xni.QName;
  28 
  29 /**
  30 
  31  * @version $Id: DFAContentModel.java,v 1.4 2010/08/06 23:49:43 joehw Exp $
  32  * DFAContentModel is the derivative of ContentModel that does
  33  * all of the non-trivial element content validation. This class does
  34  * the conversion from the regular expression to the DFA that
  35  * it then uses in its validation algorithm.
  36  * <p>
  37  * <b>Note:</b> Upstream work insures that this class will never see
  38  * a content model with PCDATA in it. Any model with PCDATA is 'mixed'
  39  * and is handled via the MixedContentModel class since mixed models
  40  * are very constrained in form and easily handled via a special case.
  41  * This also makes implementation of this class much easier.
  42  *
  43  * @xerces.internal
  44  *
  45  * @version $Id: DFAContentModel.java,v 1.4 2010/08/06 23:49:43 joehw Exp $
  46  */
  47 public class DFAContentModel
  48     implements ContentModelValidator {
  49 
  50     //
  51     // Constants
  52     //
  53     // special strings
  54 
  55     /** Epsilon string. */
  56     private static String fEpsilonString = "<<CMNODE_EPSILON>>";
  57 
  58     /** End-of-content string. */
  59     private static String fEOCString = "<<CMNODE_EOC>>";
  60 
  61     /** initializing static members **/
  62     static {
  63         fEpsilonString = fEpsilonString.intern();
  64         fEOCString = fEOCString.intern();
  65     }
  66 
  67     // debugging
  68 
  69     /** Set to true to debug content model validation. */
  70     private static final boolean DEBUG_VALIDATE_CONTENT = false;
  71 
  72     //
  73     // Data
  74     //
  75 
  76     /* this is the EquivClassComparator object */
  77     //private EquivClassComparator comparator = null;
  78 
  79     /**
  80      * This is the map of unique input symbol elements to indices into
  81      * each state's per-input symbol transition table entry. This is part
  82      * of the built DFA information that must be kept around to do the
  83      * actual validation.
  84      */
  85     private QName fElemMap[] = null;
  86 
  87     /**
  88      * This is a map of whether the element map contains information
  89      * related to ANY models.
  90      */
  91     private int fElemMapType[] = null;
  92 
  93     /** The element map size. */
  94     private int fElemMapSize = 0;
  95 
  96     /** Boolean to distinguish Schema Mixed Content */
  97     private boolean fMixed;
  98 
  99     /**
 100      * The NFA position of the special EOC (end of content) node. This
 101      * is saved away since it's used during the DFA build.
 102      */
 103     private int fEOCPos = 0;
 104 
 105 
 106     /**
 107      * This is an array of booleans, one per state (there are
 108      * fTransTableSize states in the DFA) that indicates whether that
 109      * state is a final state.
 110      */
 111     private boolean fFinalStateFlags[] = null;
 112 
 113     /**
 114      * The list of follow positions for each NFA position (i.e. for each
 115      * non-epsilon leaf node.) This is only used during the building of
 116      * the DFA, and is let go afterwards.
 117      */
 118     private CMStateSet fFollowList[] = null;
 119 
 120     /**
 121      * This is the head node of our intermediate representation. It is
 122      * only non-null during the building of the DFA (just so that it
 123      * does not have to be passed all around.) Once the DFA is built,
 124      * this is no longer required so its nulled out.
 125      */
 126     private CMNode fHeadNode = null;
 127 
 128     /**
 129      * The count of leaf nodes. This is an important number that set some
 130      * limits on the sizes of data structures in the DFA process.
 131      */
 132     private int fLeafCount = 0;
 133 
 134     /**
 135      * An array of non-epsilon leaf nodes, which is used during the DFA
 136      * build operation, then dropped.
 137      */
 138     private CMLeaf fLeafList[] = null;
 139 
 140     /** Array mapping ANY types to the leaf list. */
 141     private int fLeafListType[] = null;
 142 
 143     //private ContentLeafNameTypeVector fLeafNameTypeVector = null;
 144 
 145     /**
 146      * The string pool of our parser session. This is set during construction
 147      * and kept around.
 148      */
 149     //private StringPool fStringPool = null;
 150 
 151     /**
 152      * This is the transition table that is the main by product of all
 153      * of the effort here. It is an array of arrays of ints. The first
 154      * dimension is the number of states we end up with in the DFA. The
 155      * second dimensions is the number of unique elements in the content
 156      * model (fElemMapSize). Each entry in the second dimension indicates
 157      * the new state given that input for the first dimension's start
 158      * state.
 159      * <p>
 160      * The fElemMap array handles mapping from element indexes to
 161      * positions in the second dimension of the transition table.
 162      */
 163     private int fTransTable[][] = null;
 164 
 165     /**
 166      * The number of valid entries in the transition table, and in the other
 167      * related tables such as fFinalStateFlags.
 168      */
 169     private int fTransTableSize = 0;
 170 
 171     /**
 172      * Flag that indicates that even though we have a "complicated"
 173      * content model, it is valid to have no content. In other words,
 174      * all parts of the content model are optional. For example:
 175      * <pre>
 176      *      &lt;!ELEMENT AllOptional (Optional*,NotRequired?)&gt;
 177      * </pre>
 178      */
 179     private boolean fEmptyContentIsValid = false;
 180 
 181     // temp variables
 182 
 183     /** Temporary qualified name. */
 184     private final QName fQName = new QName();
 185 
 186     //
 187     // Constructors
 188     //
 189 
 190 
 191     //
 192     // Constructors
 193     //
 194 
 195     /**
 196      * Constructs a DFA content model.
 197      *
 198      * @param syntaxTree    The syntax tree of the content model.
 199      * @param leafCount     The number of leaves.
 200      * @param mixed
 201      *
 202      */
 203     public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) {
 204         // Store away our index and pools in members
 205         //fStringPool = stringPool;
 206         fLeafCount = leafCount;
 207 
 208 
 209         // this is for Schema Mixed Content
 210         fMixed = mixed;
 211 
 212         //
 213         //  Ok, so lets grind through the building of the DFA. This method
 214         //  handles the high level logic of the algorithm, but it uses a
 215         //  number of helper classes to do its thing.
 216         //
 217         //  In order to avoid having hundreds of references to the error and
 218         //  string handlers around, this guy and all of his helper classes
 219         //  just throw a simple exception and we then pass it along.
 220         //
 221         buildDFA(syntaxTree);
 222     }
 223 
 224     //
 225     // ContentModelValidator methods
 226     //
 227 
 228     /**
 229      * Check that the specified content is valid according to this
 230      * content model. This method can also be called to do 'what if'
 231      * testing of content models just to see if they would be valid.
 232      * <p>
 233      * A value of -1 in the children array indicates a PCDATA node. All other
 234      * indexes will be positive and represent child elements. The count can be
 235      * zero, since some elements have the EMPTY content model and that must be
 236      * confirmed.
 237      *
 238      * @param children The children of this element.  Each integer is an index within
 239      *                 the <code>StringPool</code> of the child element name.  An index
 240      *                 of -1 is used to indicate an occurrence of non-whitespace character
 241      *                 data.
 242      * @param offset Offset into the array where the children starts.
 243      * @param length The number of entries in the <code>children</code> array.
 244      *
 245      * @return The value -1 if fully valid, else the 0 based index of the child
 246      *         that first failed. If the value returned is equal to the number
 247      *         of children, then the specified children are valid but additional
 248      *         content is required to reach a valid ending state.
 249      *
 250      */
 251     public int validate(QName[] children, int offset, int length) {
 252 
 253         if (DEBUG_VALIDATE_CONTENT)
 254             System.out.println("DFAContentModel#validateContent");
 255 
 256         //
 257         // A DFA content model must *always* have at least 1 child
 258         // so a failure is given if no children present.
 259         //
 260         // Defect 782: This is an incorrect statement because a DFA
 261         // content model is also used for constructions such as:
 262         //
 263         //     (Optional*,NotRequired?)
 264         //
 265         // where a perfectly valid content would be NO CHILDREN.
 266         // Therefore, if there are no children, we must check to
 267         // see if the CMNODE_EOC marker is a valid start state! -Ac
 268         //
 269         if (length == 0) {
 270             if (DEBUG_VALIDATE_CONTENT) {
 271                 System.out.println("!!! no children");
 272                 System.out.println("elemMap="+fElemMap);
 273                 for (int i = 0; i < fElemMap.length; i++) {
 274                     String uri = fElemMap[i].uri;
 275                     String localpart = fElemMap[i].localpart;
 276 
 277                     System.out.println("fElemMap["+i+"]="+uri+","+
 278                                        localpart+" ("+
 279                                        uri+", "+
 280                                        localpart+
 281                                        ')');
 282 
 283                 }
 284                 System.out.println("EOCIndex="+fEOCString);
 285             }
 286 
 287             return fEmptyContentIsValid ? -1 : 0;
 288 
 289         } // if child count == 0
 290 
 291         //
 292         //  Lets loop through the children in the array and move our way
 293         //  through the states. Note that we use the fElemMap array to map
 294         //  an element index to a state index.
 295         //
 296         int curState = 0;
 297         for (int childIndex = 0; childIndex < length; childIndex++)
 298         {
 299             // Get the current element index out
 300             final QName curElem = children[offset + childIndex];
 301             // ignore mixed text
 302             if (fMixed && curElem.localpart == null) {
 303                 continue;
 304             }
 305 
 306             // Look up this child in our element map
 307             int elemIndex = 0;
 308             for (; elemIndex < fElemMapSize; elemIndex++)
 309             {
 310                 int type = fElemMapType[elemIndex] & 0x0f ;
 311                 if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) {
 312                     //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]);
 313                     if (fElemMap[elemIndex].rawname == curElem.rawname) {
 314                         break;
 315                     }
 316                 }
 317                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) {
 318                     String uri = fElemMap[elemIndex].uri;
 319                     if (uri == null || uri == curElem.uri) {
 320                         break;
 321                     }
 322                 }
 323                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) {
 324                     if (curElem.uri == null) {
 325                         break;
 326                     }
 327                 }
 328                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 329                     if (fElemMap[elemIndex].uri != curElem.uri) {
 330                         break;
 331                     }
 332                 }
 333             }
 334 
 335             // If we didn't find it, then obviously not valid
 336             if (elemIndex == fElemMapSize) {
 337                 if (DEBUG_VALIDATE_CONTENT) {
 338                     System.out.println("!!! didn't find it");
 339 
 340                     System.out.println("curElem : " +curElem );
 341                     for (int i=0; i<fElemMapSize; i++) {
 342                         System.out.println("fElemMap["+i+"] = " +fElemMap[i] );
 343                         System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] );
 344                     }
 345                 }
 346 
 347                 return childIndex;
 348             }
 349 
 350             //
 351             //  Look up the next state for this input symbol when in the
 352             //  current state.
 353             //
 354             curState = fTransTable[curState][elemIndex];
 355 
 356             // If its not a legal transition, then invalid
 357             if (curState == -1) {
 358                 if (DEBUG_VALIDATE_CONTENT)
 359                     System.out.println("!!! not a legal transition");
 360                 return childIndex;
 361             }
 362         }
 363 
 364         //
 365         //  We transitioned all the way through the input list. However, that
 366         //  does not mean that we ended in a final state. So check whether
 367         //  our ending state is a final state.
 368         //
 369         if (DEBUG_VALIDATE_CONTENT)
 370             System.out.println("curState="+curState+", childCount="+length);
 371         if (!fFinalStateFlags[curState])
 372             return length;
 373 
 374         // success!
 375         return -1;
 376     } // validate
 377 
 378 
 379     //
 380     // Private methods
 381     //
 382 
 383     /**
 384      * Builds the internal DFA transition table from the given syntax tree.
 385      *
 386      * @param syntaxTree The syntax tree.
 387      *
 388      * @exception CMException Thrown if DFA cannot be built.
 389      */
 390     private void buildDFA(CMNode syntaxTree)
 391     {
 392         //
 393         //  The first step we need to take is to rewrite the content model
 394         //  using our CMNode objects, and in the process get rid of any
 395         //  repetition short cuts, converting them into '*' style repetitions
 396         //  or getting rid of repetitions altogether.
 397         //
 398         //  The conversions done are:
 399         //
 400         //  x+ -> (x|x*)
 401         //  x? -> (x|epsilon)
 402         //
 403         //  This is a relatively complex scenario. What is happening is that
 404         //  we create a top level binary node of which the special EOC value
 405         //  is set as the right side node. The the left side is set to the
 406         //  rewritten syntax tree. The source is the original content model
 407         //  info from the decl pool. The rewrite is done by buildSyntaxTree()
 408         //  which recurses the decl pool's content of the element and builds
 409         //  a new tree in the process.
 410         //
 411         //  Note that, during this operation, we set each non-epsilon leaf
 412         //  node's DFA state position and count the number of such leafs, which
 413         //  is left in the fLeafCount member.
 414         //
 415         //  The nodeTmp object is passed in just as a temp node to use during
 416         //  the recursion. Otherwise, we'd have to create a new node on every
 417         //  level of recursion, which would be piggy in Java (as is everything
 418         //  for that matter.)
 419         //
 420 
 421         /* MODIFIED (Jan, 2001)
 422          *
 423          * Use following rules.
 424          *   nullable(x+) := nullable(x), first(x+) := first(x),  last(x+) := last(x)
 425          *   nullable(x?) := true, first(x?) := first(x),  last(x?) := last(x)
 426          *
 427          * The same computation of follow as x* is applied to x+
 428          *
 429          * The modification drastically reduces computation time of
 430          * "(a, (b, a+, (c, (b, a+)+, a+, (d,  (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
 431          */
 432 
 433         fQName.setValues(null, fEOCString, fEOCString, null);
 434         CMLeaf nodeEOC = new CMLeaf(fQName);
 435         fHeadNode = new CMBinOp
 436         (
 437             XMLContentSpec.CONTENTSPECNODE_SEQ
 438             , syntaxTree
 439             , nodeEOC
 440         );
 441 
 442         //
 443         //  And handle specially the EOC node, which also must be numbered
 444         //  and counted as a non-epsilon leaf node. It could not be handled
 445         //  in the above tree build because it was created before all that
 446         //  started. We save the EOC position since its used during the DFA
 447         //  building loop.
 448         //
 449         fEOCPos = fLeafCount;
 450         nodeEOC.setPosition(fLeafCount++);
 451 
 452         //
 453         //  Ok, so now we have to iterate the new tree and do a little more
 454         //  work now that we know the leaf count. One thing we need to do is
 455         //  to calculate the first and last position sets of each node. This
 456         //  is cached away in each of the nodes.
 457         //
 458         //  Along the way we also set the leaf count in each node as the
 459         //  maximum state count. They must know this in order to create their
 460         //  first/last pos sets.
 461         //
 462         //  We also need to build an array of references to the non-epsilon
 463         //  leaf nodes. Since we iterate it in the same way as before, this
 464         //  will put them in the array according to their position values.
 465         //
 466         fLeafList = new CMLeaf[fLeafCount];
 467         fLeafListType = new int[fLeafCount];
 468         postTreeBuildInit(fHeadNode, 0);
 469 
 470         //
 471         //  And, moving onward... We now need to build the follow position
 472         //  sets for all the nodes. So we allocate an array of state sets,
 473         //  one for each leaf node (i.e. each DFA position.)
 474         //
 475         fFollowList = new CMStateSet[fLeafCount];
 476         for (int index = 0; index < fLeafCount; index++)
 477             fFollowList[index] = new CMStateSet(fLeafCount);
 478         calcFollowList(fHeadNode);
 479         //
 480         //  And finally the big push... Now we build the DFA using all the
 481         //  states and the tree we've built up. First we set up the various
 482         //  data structures we are going to use while we do this.
 483         //
 484         //  First of all we need an array of unique element names in our
 485         //  content model. For each transition table entry, we need a set of
 486         //  contiguous indices to represent the transitions for a particular
 487         //  input element. So we need to a zero based range of indexes that
 488         //  map to element types. This element map provides that mapping.
 489         //
 490         fElemMap = new QName[fLeafCount];
 491         fElemMapType = new int[fLeafCount];
 492         fElemMapSize = 0;
 493         for (int outIndex = 0; outIndex < fLeafCount; outIndex++)
 494         {
 495             fElemMap[outIndex] = new QName();
 496 
 497             /****
 498             if ( (fLeafListType[outIndex] & 0x0f) != 0 ) {
 499                 if (fLeafNameTypeVector == null) {
 500                     fLeafNameTypeVector = new ContentLeafNameTypeVector();
 501                 }
 502             }
 503             /****/
 504 
 505             // Get the current leaf's element index
 506             final QName element = fLeafList[outIndex].getElement();
 507 
 508             // See if the current leaf node's element index is in the list
 509             int inIndex = 0;
 510             for (; inIndex < fElemMapSize; inIndex++)
 511             {
 512                 if (fElemMap[inIndex].rawname == element.rawname) {
 513                     break;
 514                 }
 515             }
 516 
 517             // If it was not in the list, then add it, if not the EOC node
 518             if (inIndex == fElemMapSize) {
 519                 fElemMap[fElemMapSize].setValues(element);
 520                 fElemMapType[fElemMapSize] = fLeafListType[outIndex];
 521                 fElemMapSize++;
 522             }
 523         }
 524         // set up the fLeafNameTypeVector object if there is one.
 525         /*****
 526         if (fLeafNameTypeVector != null) {
 527             fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize);
 528         }
 529 
 530         /***
 531         * Optimization(Jan, 2001); We sort fLeafList according to
 532         * elemIndex which is *uniquely* associated to each leaf.
 533         * We are *assuming* that each element appears in at least one leaf.
 534         **/
 535 
 536         int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
 537         int fSortCount = 0;
 538 
 539         for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
 540             for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
 541                     final QName leaf = fLeafList[leafIndex].getElement();
 542                     final QName element = fElemMap[elemIndex];
 543                     if (leaf.rawname == element.rawname) {
 544                             fLeafSorter[fSortCount++] = leafIndex;
 545                     }
 546             }
 547             fLeafSorter[fSortCount++] = -1;
 548         }
 549 
 550         /* Optimization(Jan, 2001) */
 551 
 552         //
 553         //  Next lets create some arrays, some that that hold transient
 554         //  information during the DFA build and some that are permament.
 555         //  These are kind of sticky since we cannot know how big they will
 556         //  get, but we don't want to use any Java collections because of
 557         //  performance.
 558         //
 559         //  Basically they will probably be about fLeafCount*2 on average,
 560         //  but can be as large as 2^(fLeafCount*2), worst case. So we start
 561         //  with fLeafCount*4 as a middle ground. This will be very unlikely
 562         //  to ever have to expand, though it if does, the overhead will be
 563         //  somewhat ugly.
 564         //
 565         int curArraySize = fLeafCount * 4;
 566         CMStateSet[] statesToDo = new CMStateSet[curArraySize];
 567         fFinalStateFlags = new boolean[curArraySize];
 568         fTransTable = new int[curArraySize][];
 569 
 570         //
 571         //  Ok we start with the initial set as the first pos set of the
 572         //  head node (which is the seq node that holds the content model
 573         //  and the EOC node.)
 574         //
 575         CMStateSet setT = fHeadNode.firstPos();
 576 
 577         //
 578         //  Init our two state flags. Basically the unmarked state counter
 579         //  is always chasing the current state counter. When it catches up,
 580         //  that means we made a pass through that did not add any new states
 581         //  to the lists, at which time we are done. We could have used a
 582         //  expanding array of flags which we used to mark off states as we
 583         //  complete them, but this is easier though less readable maybe.
 584         //
 585         int unmarkedState = 0;
 586         int curState = 0;
 587 
 588         //
 589         //  Init the first transition table entry, and put the initial state
 590         //  into the states to do list, then bump the current state.
 591         //
 592         fTransTable[curState] = makeDefStateList();
 593         statesToDo[curState] = setT;
 594         curState++;
 595 
 596             /* Optimization(Jan, 2001); This is faster for
 597              * a large content model such as, "(t001+|t002+|.... |t500+)".
 598              */
 599 
 600         HashMap stateTable = new HashMap();
 601 
 602             /* Optimization(Jan, 2001) */
 603 
 604         //
 605         //  Ok, almost done with the algorithm... We now enter the
 606         //  loop where we go until the states done counter catches up with
 607         //  the states to do counter.
 608         //
 609         while (unmarkedState < curState)
 610         {
 611             //
 612             //  Get the first unmarked state out of the list of states to do.
 613             //  And get the associated transition table entry.
 614             //
 615             setT = statesToDo[unmarkedState];
 616             int[] transEntry = fTransTable[unmarkedState];
 617 
 618             // Mark this one final if it contains the EOC state
 619             fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos);
 620 
 621             // Bump up the unmarked state count, marking this state done
 622             unmarkedState++;
 623 
 624             // Loop through each possible input symbol in the element map
 625             CMStateSet newSet = null;
 626             /* Optimization(Jan, 2001) */
 627             int sorterIndex = 0;
 628             /* Optimization(Jan, 2001) */
 629             for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
 630             {
 631                 //
 632                 //  Build up a set of states which is the union of all of
 633                 //  the follow sets of DFA positions that are in the current
 634                 //  state. If we gave away the new set last time through then
 635                 //  create a new one. Otherwise, zero out the existing one.
 636                 //
 637                 if (newSet == null)
 638                     newSet = new CMStateSet(fLeafCount);
 639                 else
 640                     newSet.zeroBits();
 641 
 642             /* Optimization(Jan, 2001) */
 643                 int leafIndex = fLeafSorter[sorterIndex++];
 644 
 645                 while (leafIndex != -1) {
 646                 // If this leaf index (DFA position) is in the current set...
 647                     if (setT.getBit(leafIndex))
 648                     {
 649                         //
 650                         //  If this leaf is the current input symbol, then we
 651                         //  want to add its follow list to the set of states to
 652                         //  transition to from the current state.
 653                         //
 654                                 newSet.union(fFollowList[leafIndex]);
 655                             }
 656 
 657                    leafIndex = fLeafSorter[sorterIndex++];
 658         }
 659             /* Optimization(Jan, 2001) */
 660 
 661                 //
 662                 //  If this new set is not empty, then see if its in the list
 663                 //  of states to do. If not, then add it.
 664                 //
 665                 if (!newSet.isEmpty())
 666                 {
 667                     //
 668                     //  Search the 'states to do' list to see if this new
 669                     //  state set is already in there.
 670                     //
 671 
 672             /* Optimization(Jan, 2001) */
 673             Integer stateObj = (Integer)stateTable.get(newSet);
 674             int stateIndex = (stateObj == null ? curState : stateObj.intValue());
 675             /* Optimization(Jan, 2001) */
 676 
 677                     // If we did not find it, then add it
 678                     if (stateIndex == curState)
 679                     {
 680                         //
 681                         //  Put this new state into the states to do and init
 682                         //  a new entry at the same index in the transition
 683                         //  table.
 684                         //
 685                         statesToDo[curState] = newSet;
 686                         fTransTable[curState] = makeDefStateList();
 687 
 688             /* Optimization(Jan, 2001) */
 689                         stateTable.put(newSet, new Integer(curState));
 690             /* Optimization(Jan, 2001) */
 691 
 692                         // We now have a new state to do so bump the count
 693                         curState++;
 694 
 695                         //
 696                         //  Null out the new set to indicate we adopted it.
 697                         //  This will cause the creation of a new set on the
 698                         //  next time around the loop.
 699                         //
 700                         newSet = null;
 701                     }
 702 
 703                     //
 704                     //  Now set this state in the transition table's entry
 705                     //  for this element (using its index), with the DFA
 706                     //  state we will move to from the current state when we
 707                     //  see this input element.
 708                     //
 709                     transEntry[elemIndex] = stateIndex;
 710 
 711                     // Expand the arrays if we're full
 712                     if (curState == curArraySize)
 713                     {
 714                         //
 715                         //  Yikes, we overflowed the initial array size, so
 716                         //  we've got to expand all of these arrays. So adjust
 717                         //  up the size by 50% and allocate new arrays.
 718                         //
 719                         final int newSize = (int)(curArraySize * 1.5);
 720                         CMStateSet[] newToDo = new CMStateSet[newSize];
 721                         boolean[] newFinalFlags = new boolean[newSize];
 722                         int[][] newTransTable = new int[newSize][];
 723 
 724                         // Copy over all of the existing content
 725                         System.arraycopy(statesToDo, 0, newToDo, 0, curArraySize);
 726                         System.arraycopy(fFinalStateFlags, 0, newFinalFlags, 0, curArraySize);
 727                         System.arraycopy(fTransTable, 0, newTransTable, 0, curArraySize);
 728 
 729                         // Store the new array size
 730                         curArraySize = newSize;
 731                         statesToDo = newToDo;
 732                         fFinalStateFlags = newFinalFlags;
 733                         fTransTable = newTransTable;
 734                     }
 735                 }
 736             }
 737         }
 738 
 739         // Check to see if we can set the fEmptyContentIsValid flag.
 740         fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable();
 741 
 742         //
 743         //  And now we can say bye bye to the temp representation since we've
 744         //  built the DFA.
 745         //
 746         if (DEBUG_VALIDATE_CONTENT)
 747             dumpTree(fHeadNode, 0);
 748         fHeadNode = null;
 749         fLeafList = null;
 750         fFollowList = null;
 751 
 752     }
 753 
 754     /**
 755      * Calculates the follow list of the current node.
 756      *
 757      * @param nodeCur The curent node.
 758      *
 759      * @exception CMException Thrown if follow list cannot be calculated.
 760      */
 761     private void calcFollowList(CMNode nodeCur)
 762     {
 763         // Recurse as required
 764         if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 765         {
 766             // Recurse only
 767             calcFollowList(((CMBinOp)nodeCur).getLeft());
 768             calcFollowList(((CMBinOp)nodeCur).getRight());
 769         }
 770          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)
 771         {
 772             // Recurse first
 773             calcFollowList(((CMBinOp)nodeCur).getLeft());
 774             calcFollowList(((CMBinOp)nodeCur).getRight());
 775 
 776             //
 777             //  Now handle our level. We use our left child's last pos
 778             //  set and our right child's first pos set, so go ahead and
 779             //  get them ahead of time.
 780             //
 781             final CMStateSet last  = ((CMBinOp)nodeCur).getLeft().lastPos();
 782             final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos();
 783 
 784             //
 785             //  Now, for every position which is in our left child's last set
 786             //  add all of the states in our right child's first set to the
 787             //  follow set for that position.
 788             //
 789             for (int index = 0; index < fLeafCount; index++)
 790             {
 791                 if (last.getBit(index))
 792                     fFollowList[index].union(first);
 793             }
 794         }
 795          /***
 796          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 797         {
 798             // Recurse first
 799             calcFollowList(((CMUniOp)nodeCur).getChild());
 800 
 801             //
 802             //  Now handle our level. We use our own first and last position
 803             //  sets, so get them up front.
 804             //
 805             final CMStateSet first = nodeCur.firstPos();
 806             final CMStateSet last  = nodeCur.lastPos();
 807 
 808             //
 809             //  For every position which is in our last position set, add all
 810             //  of our first position states to the follow set for that
 811             //  position.
 812             //
 813             for (int index = 0; index < fLeafCount; index++)
 814             {
 815                 if (last.getBit(index))
 816                     fFollowList[index].union(first);
 817             }
 818         }
 819          else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 820               ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE))
 821         {
 822             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 823         }
 824         /***/
 825          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
 826             || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 827         {
 828             // Recurse first
 829             calcFollowList(((CMUniOp)nodeCur).getChild());
 830 
 831             //
 832             //  Now handle our level. We use our own first and last position
 833             //  sets, so get them up front.
 834             //
 835             final CMStateSet first = nodeCur.firstPos();
 836             final CMStateSet last  = nodeCur.lastPos();
 837 
 838             //
 839             //  For every position which is in our last position set, add all
 840             //  of our first position states to the follow set for that
 841             //  position.
 842             //
 843             for (int index = 0; index < fLeafCount; index++)
 844             {
 845                 if (last.getBit(index))
 846                     fFollowList[index].union(first);
 847             }
 848         }
 849 
 850         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) {
 851             // Recurse only
 852             calcFollowList(((CMUniOp)nodeCur).getChild());
 853         }
 854          /***/
 855     }
 856 
 857     /**
 858      * Dumps the tree of the current node to standard output.
 859      *
 860      * @param nodeCur The current node.
 861      * @param level   The maximum levels to output.
 862      *
 863      * @exception CMException Thrown on error.
 864      */
 865     private void dumpTree(CMNode nodeCur, int level)
 866     {
 867         for (int index = 0; index < level; index++)
 868             System.out.print("   ");
 869 
 870         int type = nodeCur.type();
 871         if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 872         ||  (type == XMLContentSpec.CONTENTSPECNODE_SEQ))
 873         {
 874             if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 875                 System.out.print("Choice Node ");
 876             else
 877                 System.out.print("Seq Node ");
 878 
 879             if (nodeCur.isNullable())
 880                 System.out.print("Nullable ");
 881 
 882             System.out.print("firstPos=");
 883             System.out.print(nodeCur.firstPos().toString());
 884             System.out.print(" lastPos=");
 885             System.out.println(nodeCur.lastPos().toString());
 886 
 887             dumpTree(((CMBinOp)nodeCur).getLeft(), level+1);
 888             dumpTree(((CMBinOp)nodeCur).getRight(), level+1);
 889         }
 890          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 891         {
 892             System.out.print("Rep Node ");
 893 
 894             if (nodeCur.isNullable())
 895                 System.out.print("Nullable ");
 896 
 897             System.out.print("firstPos=");
 898             System.out.print(nodeCur.firstPos().toString());
 899             System.out.print(" lastPos=");
 900             System.out.println(nodeCur.lastPos().toString());
 901 
 902             dumpTree(((CMUniOp)nodeCur).getChild(), level+1);
 903         }
 904          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
 905         {
 906             System.out.print
 907             (
 908                 "Leaf: (pos="
 909                 + ((CMLeaf)nodeCur).getPosition()
 910                 + "), "
 911                 + ((CMLeaf)nodeCur).getElement()
 912                 + "(elemIndex="
 913                 + ((CMLeaf)nodeCur).getElement()
 914                 + ") "
 915             );
 916 
 917             if (nodeCur.isNullable())
 918                 System.out.print(" Nullable ");
 919 
 920             System.out.print("firstPos=");
 921             System.out.print(nodeCur.firstPos().toString());
 922             System.out.print(" lastPos=");
 923             System.out.println(nodeCur.lastPos().toString());
 924         }
 925          else
 926         {
 927             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 928         }
 929     }
 930 
 931 
 932     /**
 933      * -1 is used to represent bad transitions in the transition table
 934      * entry for each state. So each entry is initialized to an all -1
 935      * array. This method creates a new entry and initializes it.
 936      */
 937     private int[] makeDefStateList()
 938     {
 939         int[] retArray = new int[fElemMapSize];
 940         for (int index = 0; index < fElemMapSize; index++)
 941             retArray[index] = -1;
 942         return retArray;
 943     }
 944 
 945     /** Post tree build initialization. */
 946     private int postTreeBuildInit(CMNode nodeCur, int curIndex)
 947     {
 948         // Set the maximum states on this node
 949         nodeCur.setMaxStates(fLeafCount);
 950 
 951         // Recurse as required
 952         if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY ||
 953             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL ||
 954             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 955             // REVISIT: Don't waste these structures.
 956             QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI());
 957             fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition());
 958             fLeafListType[curIndex] = nodeCur.type();
 959             curIndex++;
 960         }
 961         else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 962         ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ))
 963         {
 964             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex);
 965             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex);
 966         }
 967          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
 968              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE
 969              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)
 970         {
 971             curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex);
 972         }
 973          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
 974         {
 975             //
 976             //  Put this node in the leaf list at the current index if its
 977             //  a non-epsilon leaf.
 978             //
 979              final QName node = ((CMLeaf)nodeCur).getElement();
 980             if (node.localpart != fEpsilonString) {
 981                 fLeafList[curIndex] = (CMLeaf)nodeCur;
 982                 fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF;
 983                 curIndex++;
 984             }
 985         }
 986          else
 987         {
 988             throw new RuntimeException("ImplementationMessages.VAL_NIICM: type="+nodeCur.type());
 989         }
 990         return curIndex;
 991     }
 992 
 993 } // class DFAContentModel