New src/com/sun/org/apache/xerces/internal/impl/dtd/models/DFAContentModel.java

   1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * The Apache Software License, Version 1.1
   7  *
   8  *
   9  * Copyright (c) 1999-2002 The Apache Software Foundation.  All rights
  10  * reserved.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  *
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  *
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in
  21  *    the documentation and/or other materials provided with the
  22  *    distribution.
  23  *
  24  * 3. The end-user documentation included with the redistribution,
  25  *    if any, must include the following acknowledgment:
  26  *       "This product includes software developed by the
  27  *        Apache Software Foundation (http://www.apache.org/)."
  28  *    Alternately, this acknowledgment may appear in the software itself,
  29  *    if and wherever such third-party acknowledgments normally appear.
  30  *
  31  * 4. The names "Xerces" and "Apache Software Foundation" must
  32  *    not be used to endorse or promote products derived from this
  33  *    software without prior written permission. For written
  34  *    permission, please contact apache@apache.org.
  35  *
  36  * 5. Products derived from this software may not be called "Apache",
  37  *    nor may "Apache" appear in their name, without prior written
  38  *    permission of the Apache Software Foundation.
  39  *
  40  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  41  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  42  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  43  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  44  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  45  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  46  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  47  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  48  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  49  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  50  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51  * SUCH DAMAGE.
  52  * ====================================================================
  53  *
  54  * This software consists of voluntary contributions made by many
  55  * individuals on behalf of the Apache Software Foundation and was
  56  * originally based on software copyright (c) 1999, International
  57  * Business Machines, Inc., http://www.apache.org.  For more
  58  * information on the Apache Software Foundation, please see
  59  * <http://www.apache.org/>.
  60  */
  61 
  62 package com.sun.org.apache.xerces.internal.impl.dtd.models;
  63 
  64 import java.util.HashMap;
  65 
  66 import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec;
  67 import com.sun.org.apache.xerces.internal.xni.QName;
  68 
  69 /**
  70 
  71  * DFAContentModel is the derivative of ContentModel that does
  72  * all of the non-trivial element content validation. This class does
  73  * the conversion from the regular expression to the DFA that
  74  * it then uses in its validation algorithm.
  75  * <p>
  76  * <b>Note:</b> Upstream work insures that this class will never see
  77  * a content model with PCDATA in it. Any model with PCDATA is 'mixed'
  78  * and is handled via the MixedContentModel class since mixed models
  79  * are very constrained in form and easily handled via a special case.
  80  * This also makes implementation of this class much easier.
  81  *
  82  * @xerces.internal
  83  *
  84  */
  85 public class DFAContentModel
  86     implements ContentModelValidator {
  87 
  88     //
  89     // Constants
  90     //
  91     // special strings
  92 
  93     /** Epsilon string. */
  94     private static String fEpsilonString = "<<CMNODE_EPSILON>>";
  95 
  96     /** End-of-content string. */
  97     private static String fEOCString = "<<CMNODE_EOC>>";
  98 
  99     /** initializing static members **/
 100     static {
 101         fEpsilonString = fEpsilonString.intern();
 102         fEOCString = fEOCString.intern();
 103     }
 104 
 105     // debugging
 106 
 107     /** Set to true to debug content model validation. */
 108     private static final boolean DEBUG_VALIDATE_CONTENT = false;
 109 
 110     //
 111     // Data
 112     //
 113 
 114     /* this is the EquivClassComparator object */
 115     //private EquivClassComparator comparator = null;
 116 
 117     /**
 118      * This is the map of unique input symbol elements to indices into
 119      * each state's per-input symbol transition table entry. This is part
 120      * of the built DFA information that must be kept around to do the
 121      * actual validation.
 122      */
 123     private QName fElemMap[] = null;
 124 
 125     /**
 126      * This is a map of whether the element map contains information
 127      * related to ANY models.
 128      */
 129     private int fElemMapType[] = null;
 130 
 131     /** The element map size. */
 132     private int fElemMapSize = 0;
 133 
 134     /** Boolean to distinguish Schema Mixed Content */
 135     private boolean fMixed;
 136 
 137     /**
 138      * The NFA position of the special EOC (end of content) node. This
 139      * is saved away since it's used during the DFA build.
 140      */
 141     private int fEOCPos = 0;
 142 
 143 
 144     /**
 145      * This is an array of booleans, one per state (there are
 146      * fTransTableSize states in the DFA) that indicates whether that
 147      * state is a final state.
 148      */
 149     private boolean fFinalStateFlags[] = null;
 150 
 151     /**
 152      * The list of follow positions for each NFA position (i.e. for each
 153      * non-epsilon leaf node.) This is only used during the building of
 154      * the DFA, and is let go afterwards.
 155      */
 156     private CMStateSet fFollowList[] = null;
 157 
 158     /**
 159      * This is the head node of our intermediate representation. It is
 160      * only non-null during the building of the DFA (just so that it
 161      * does not have to be passed all around.) Once the DFA is built,
 162      * this is no longer required so its nulled out.
 163      */
 164     private CMNode fHeadNode = null;
 165 
 166     /**
 167      * The count of leaf nodes. This is an important number that set some
 168      * limits on the sizes of data structures in the DFA process.
 169      */
 170     private int fLeafCount = 0;
 171 
 172     /**
 173      * An array of non-epsilon leaf nodes, which is used during the DFA
 174      * build operation, then dropped.
 175      */
 176     private CMLeaf fLeafList[] = null;
 177 
 178     /** Array mapping ANY types to the leaf list. */
 179     private int fLeafListType[] = null;
 180 
 181     //private ContentLeafNameTypeVector fLeafNameTypeVector = null;
 182 
 183     /**
 184      * The string pool of our parser session. This is set during construction
 185      * and kept around.
 186      */
 187     //private StringPool fStringPool = null;
 188 
 189     /**
 190      * This is the transition table that is the main by product of all
 191      * of the effort here. It is an array of arrays of ints. The first
 192      * dimension is the number of states we end up with in the DFA. The
 193      * second dimensions is the number of unique elements in the content
 194      * model (fElemMapSize). Each entry in the second dimension indicates
 195      * the new state given that input for the first dimension's start
 196      * state.
 197      * <p>
 198      * The fElemMap array handles mapping from element indexes to
 199      * positions in the second dimension of the transition table.
 200      */
 201     private int fTransTable[][] = null;
 202 
 203     /**
 204      * The number of valid entries in the transition table, and in the other
 205      * related tables such as fFinalStateFlags.
 206      */
 207     private int fTransTableSize = 0;
 208 
 209     /**
 210      * Flag that indicates that even though we have a "complicated"
 211      * content model, it is valid to have no content. In other words,
 212      * all parts of the content model are optional. For example:
 213      * <pre>
 214      *      &lt;!ELEMENT AllOptional (Optional*,NotRequired?)&gt;
 215      * </pre>
 216      */
 217     private boolean fEmptyContentIsValid = false;
 218 
 219     // temp variables
 220 
 221     /** Temporary qualified name. */
 222     private final QName fQName = new QName();
 223 
 224     //
 225     // Constructors
 226     //
 227 
 228 
 229     //
 230     // Constructors
 231     //
 232 
 233     /**
 234      * Constructs a DFA content model.
 235      *
 236      * @param syntaxTree    The syntax tree of the content model.
 237      * @param leafCount     The number of leaves.
 238      * @param mixed
 239      *
 240      */
 241     public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) {
 242         // Store away our index and pools in members
 243         //fStringPool = stringPool;
 244         fLeafCount = leafCount;
 245 
 246 
 247         // this is for Schema Mixed Content
 248         fMixed = mixed;
 249 
 250         //
 251         //  Ok, so lets grind through the building of the DFA. This method
 252         //  handles the high level logic of the algorithm, but it uses a
 253         //  number of helper classes to do its thing.
 254         //
 255         //  In order to avoid having hundreds of references to the error and
 256         //  string handlers around, this guy and all of his helper classes
 257         //  just throw a simple exception and we then pass it along.
 258         //
 259         buildDFA(syntaxTree);
 260     }
 261 
 262     //
 263     // ContentModelValidator methods
 264     //
 265 
 266     /**
 267      * Check that the specified content is valid according to this
 268      * content model. This method can also be called to do 'what if'
 269      * testing of content models just to see if they would be valid.
 270      * <p>
 271      * A value of -1 in the children array indicates a PCDATA node. All other
 272      * indexes will be positive and represent child elements. The count can be
 273      * zero, since some elements have the EMPTY content model and that must be
 274      * confirmed.
 275      *
 276      * @param children The children of this element.  Each integer is an index within
 277      *                 the <code>StringPool</code> of the child element name.  An index
 278      *                 of -1 is used to indicate an occurrence of non-whitespace character
 279      *                 data.
 280      * @param offset Offset into the array where the children starts.
 281      * @param length The number of entries in the <code>children</code> array.
 282      *
 283      * @return The value -1 if fully valid, else the 0 based index of the child
 284      *         that first failed. If the value returned is equal to the number
 285      *         of children, then the specified children are valid but additional
 286      *         content is required to reach a valid ending state.
 287      *
 288      */
 289     public int validate(QName[] children, int offset, int length) {
 290 
 291         if (DEBUG_VALIDATE_CONTENT)
 292             System.out.println("DFAContentModel#validateContent");
 293 
 294         //
 295         // A DFA content model must *always* have at least 1 child
 296         // so a failure is given if no children present.
 297         //
 298         // Defect 782: This is an incorrect statement because a DFA
 299         // content model is also used for constructions such as:
 300         //
 301         //     (Optional*,NotRequired?)
 302         //
 303         // where a perfectly valid content would be NO CHILDREN.
 304         // Therefore, if there are no children, we must check to
 305         // see if the CMNODE_EOC marker is a valid start state! -Ac
 306         //
 307         if (length == 0) {
 308             if (DEBUG_VALIDATE_CONTENT) {
 309                 System.out.println("!!! no children");
 310                 System.out.println("elemMap="+fElemMap);
 311                 for (int i = 0; i < fElemMap.length; i++) {
 312                     String uri = fElemMap[i].uri;
 313                     String localpart = fElemMap[i].localpart;
 314 
 315                     System.out.println("fElemMap["+i+"]="+uri+","+
 316                                        localpart+" ("+
 317                                        uri+", "+
 318                                        localpart+
 319                                        ')');
 320 
 321                 }
 322                 System.out.println("EOCIndex="+fEOCString);
 323             }
 324 
 325             return fEmptyContentIsValid ? -1 : 0;
 326 
 327         } // if child count == 0
 328 
 329         //
 330         //  Lets loop through the children in the array and move our way
 331         //  through the states. Note that we use the fElemMap array to map
 332         //  an element index to a state index.
 333         //
 334         int curState = 0;
 335         for (int childIndex = 0; childIndex < length; childIndex++)
 336         {
 337             // Get the current element index out
 338             final QName curElem = children[offset + childIndex];
 339             // ignore mixed text
 340             if (fMixed && curElem.localpart == null) {
 341                 continue;
 342             }
 343 
 344             // Look up this child in our element map
 345             int elemIndex = 0;
 346             for (; elemIndex < fElemMapSize; elemIndex++)
 347             {
 348                 int type = fElemMapType[elemIndex] & 0x0f ;
 349                 if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) {
 350                     //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]);
 351                     if (fElemMap[elemIndex].rawname == curElem.rawname) {
 352                         break;
 353                     }
 354                 }
 355                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) {
 356                     String uri = fElemMap[elemIndex].uri;
 357                     if (uri == null || uri == curElem.uri) {
 358                         break;
 359                     }
 360                 }
 361                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) {
 362                     if (curElem.uri == null) {
 363                         break;
 364                     }
 365                 }
 366                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 367                     if (fElemMap[elemIndex].uri != curElem.uri) {
 368                         break;
 369                     }
 370                 }
 371             }
 372 
 373             // If we didn't find it, then obviously not valid
 374             if (elemIndex == fElemMapSize) {
 375                 if (DEBUG_VALIDATE_CONTENT) {
 376                     System.out.println("!!! didn't find it");
 377 
 378                     System.out.println("curElem : " +curElem );
 379                     for (int i=0; i<fElemMapSize; i++) {
 380                         System.out.println("fElemMap["+i+"] = " +fElemMap[i] );
 381                         System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] );
 382                     }
 383                 }
 384 
 385                 return childIndex;
 386             }
 387 
 388             //
 389             //  Look up the next state for this input symbol when in the
 390             //  current state.
 391             //
 392             curState = fTransTable[curState][elemIndex];
 393 
 394             // If its not a legal transition, then invalid
 395             if (curState == -1) {
 396                 if (DEBUG_VALIDATE_CONTENT)
 397                     System.out.println("!!! not a legal transition");
 398                 return childIndex;
 399             }
 400         }
 401 
 402         //
 403         //  We transitioned all the way through the input list. However, that
 404         //  does not mean that we ended in a final state. So check whether
 405         //  our ending state is a final state.
 406         //
 407         if (DEBUG_VALIDATE_CONTENT)
 408             System.out.println("curState="+curState+", childCount="+length);
 409         if (!fFinalStateFlags[curState])
 410             return length;
 411 
 412         // success!
 413         return -1;
 414     } // validate
 415 
 416 
 417     //
 418     // Private methods
 419     //
 420 
 421     /**
 422      * Builds the internal DFA transition table from the given syntax tree.
 423      *
 424      * @param syntaxTree The syntax tree.
 425      *
 426      * @exception CMException Thrown if DFA cannot be built.
 427      */
 428     private void buildDFA(CMNode syntaxTree)
 429     {
 430         //
 431         //  The first step we need to take is to rewrite the content model
 432         //  using our CMNode objects, and in the process get rid of any
 433         //  repetition short cuts, converting them into '*' style repetitions
 434         //  or getting rid of repetitions altogether.
 435         //
 436         //  The conversions done are:
 437         //
 438         //  x+ -> (x|x*)
 439         //  x? -> (x|epsilon)
 440         //
 441         //  This is a relatively complex scenario. What is happening is that
 442         //  we create a top level binary node of which the special EOC value
 443         //  is set as the right side node. The the left side is set to the
 444         //  rewritten syntax tree. The source is the original content model
 445         //  info from the decl pool. The rewrite is done by buildSyntaxTree()
 446         //  which recurses the decl pool's content of the element and builds
 447         //  a new tree in the process.
 448         //
 449         //  Note that, during this operation, we set each non-epsilon leaf
 450         //  node's DFA state position and count the number of such leafs, which
 451         //  is left in the fLeafCount member.
 452         //
 453         //  The nodeTmp object is passed in just as a temp node to use during
 454         //  the recursion. Otherwise, we'd have to create a new node on every
 455         //  level of recursion, which would be piggy in Java (as is everything
 456         //  for that matter.)
 457         //
 458 
 459         /* MODIFIED (Jan, 2001)
 460          *
 461          * Use following rules.
 462          *   nullable(x+) := nullable(x), first(x+) := first(x),  last(x+) := last(x)
 463          *   nullable(x?) := true, first(x?) := first(x),  last(x?) := last(x)
 464          *
 465          * The same computation of follow as x* is applied to x+
 466          *
 467          * The modification drastically reduces computation time of
 468          * "(a, (b, a+, (c, (b, a+)+, a+, (d,  (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
 469          */
 470 
 471         fQName.setValues(null, fEOCString, fEOCString, null);
 472         CMLeaf nodeEOC = new CMLeaf(fQName);
 473         fHeadNode = new CMBinOp
 474         (
 475             XMLContentSpec.CONTENTSPECNODE_SEQ
 476             , syntaxTree
 477             , nodeEOC
 478         );
 479 
 480         //
 481         //  And handle specially the EOC node, which also must be numbered
 482         //  and counted as a non-epsilon leaf node. It could not be handled
 483         //  in the above tree build because it was created before all that
 484         //  started. We save the EOC position since its used during the DFA
 485         //  building loop.
 486         //
 487         fEOCPos = fLeafCount;
 488         nodeEOC.setPosition(fLeafCount++);
 489 
 490         //
 491         //  Ok, so now we have to iterate the new tree and do a little more
 492         //  work now that we know the leaf count. One thing we need to do is
 493         //  to calculate the first and last position sets of each node. This
 494         //  is cached away in each of the nodes.
 495         //
 496         //  Along the way we also set the leaf count in each node as the
 497         //  maximum state count. They must know this in order to create their
 498         //  first/last pos sets.
 499         //
 500         //  We also need to build an array of references to the non-epsilon
 501         //  leaf nodes. Since we iterate it in the same way as before, this
 502         //  will put them in the array according to their position values.
 503         //
 504         fLeafList = new CMLeaf[fLeafCount];
 505         fLeafListType = new int[fLeafCount];
 506         postTreeBuildInit(fHeadNode, 0);
 507 
 508         //
 509         //  And, moving onward... We now need to build the follow position
 510         //  sets for all the nodes. So we allocate an array of state sets,
 511         //  one for each leaf node (i.e. each DFA position.)
 512         //
 513         fFollowList = new CMStateSet[fLeafCount];
 514         for (int index = 0; index < fLeafCount; index++)
 515             fFollowList[index] = new CMStateSet(fLeafCount);
 516         calcFollowList(fHeadNode);
 517         //
 518         //  And finally the big push... Now we build the DFA using all the
 519         //  states and the tree we've built up. First we set up the various
 520         //  data structures we are going to use while we do this.
 521         //
 522         //  First of all we need an array of unique element names in our
 523         //  content model. For each transition table entry, we need a set of
 524         //  contiguous indices to represent the transitions for a particular
 525         //  input element. So we need to a zero based range of indexes that
 526         //  map to element types. This element map provides that mapping.
 527         //
 528         fElemMap = new QName[fLeafCount];
 529         fElemMapType = new int[fLeafCount];
 530         fElemMapSize = 0;
 531         for (int outIndex = 0; outIndex < fLeafCount; outIndex++)
 532         {
 533             fElemMap[outIndex] = new QName();
 534 
 535             /****
 536             if ( (fLeafListType[outIndex] & 0x0f) != 0 ) {
 537                 if (fLeafNameTypeVector == null) {
 538                     fLeafNameTypeVector = new ContentLeafNameTypeVector();
 539                 }
 540             }
 541             /****/
 542 
 543             // Get the current leaf's element index
 544             final QName element = fLeafList[outIndex].getElement();
 545 
 546             // See if the current leaf node's element index is in the list
 547             int inIndex = 0;
 548             for (; inIndex < fElemMapSize; inIndex++)
 549             {
 550                 if (fElemMap[inIndex].rawname == element.rawname) {
 551                     break;
 552                 }
 553             }
 554 
 555             // If it was not in the list, then add it, if not the EOC node
 556             if (inIndex == fElemMapSize) {
 557                 fElemMap[fElemMapSize].setValues(element);
 558                 fElemMapType[fElemMapSize] = fLeafListType[outIndex];
 559                 fElemMapSize++;
 560             }
 561         }
 562         // set up the fLeafNameTypeVector object if there is one.
 563         /*****
 564         if (fLeafNameTypeVector != null) {
 565             fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize);
 566         }
 567 
 568         /***
 569         * Optimization(Jan, 2001); We sort fLeafList according to
 570         * elemIndex which is *uniquely* associated to each leaf.
 571         * We are *assuming* that each element appears in at least one leaf.
 572         **/
 573 
 574         int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
 575         int fSortCount = 0;
 576 
 577         for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
 578             for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
 579                     final QName leaf = fLeafList[leafIndex].getElement();
 580                     final QName element = fElemMap[elemIndex];
 581                     if (leaf.rawname == element.rawname) {
 582                             fLeafSorter[fSortCount++] = leafIndex;
 583                     }
 584             }
 585             fLeafSorter[fSortCount++] = -1;
 586         }
 587 
 588         /* Optimization(Jan, 2001) */
 589 
 590         //
 591         //  Next lets create some arrays, some that that hold transient
 592         //  information during the DFA build and some that are permament.
 593         //  These are kind of sticky since we cannot know how big they will
 594         //  get, but we don't want to use any Java collections because of
 595         //  performance.
 596         //
 597         //  Basically they will probably be about fLeafCount*2 on average,
 598         //  but can be as large as 2^(fLeafCount*2), worst case. So we start
 599         //  with fLeafCount*4 as a middle ground. This will be very unlikely
 600         //  to ever have to expand, though it if does, the overhead will be
 601         //  somewhat ugly.
 602         //
 603         int curArraySize = fLeafCount * 4;
 604         CMStateSet[] statesToDo = new CMStateSet[curArraySize];
 605         fFinalStateFlags = new boolean[curArraySize];
 606         fTransTable = new int[curArraySize][];
 607 
 608         //
 609         //  Ok we start with the initial set as the first pos set of the
 610         //  head node (which is the seq node that holds the content model
 611         //  and the EOC node.)
 612         //
 613         CMStateSet setT = fHeadNode.firstPos();
 614 
 615         //
 616         //  Init our two state flags. Basically the unmarked state counter
 617         //  is always chasing the current state counter. When it catches up,
 618         //  that means we made a pass through that did not add any new states
 619         //  to the lists, at which time we are done. We could have used a
 620         //  expanding array of flags which we used to mark off states as we
 621         //  complete them, but this is easier though less readable maybe.
 622         //
 623         int unmarkedState = 0;
 624         int curState = 0;
 625 
 626         //
 627         //  Init the first transition table entry, and put the initial state
 628         //  into the states to do list, then bump the current state.
 629         //
 630         fTransTable[curState] = makeDefStateList();
 631         statesToDo[curState] = setT;
 632         curState++;
 633 
 634             /* Optimization(Jan, 2001); This is faster for
 635              * a large content model such as, "(t001+|t002+|.... |t500+)".
 636              */
 637 
 638         HashMap stateTable = new HashMap();
 639 
 640             /* Optimization(Jan, 2001) */
 641 
 642         //
 643         //  Ok, almost done with the algorithm... We now enter the
 644         //  loop where we go until the states done counter catches up with
 645         //  the states to do counter.
 646         //
 647         while (unmarkedState < curState)
 648         {
 649             //
 650             //  Get the first unmarked state out of the list of states to do.
 651             //  And get the associated transition table entry.
 652             //
 653             setT = statesToDo[unmarkedState];
 654             int[] transEntry = fTransTable[unmarkedState];
 655 
 656             // Mark this one final if it contains the EOC state
 657             fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos);
 658 
 659             // Bump up the unmarked state count, marking this state done
 660             unmarkedState++;
 661 
 662             // Loop through each possible input symbol in the element map
 663             CMStateSet newSet = null;
 664             /* Optimization(Jan, 2001) */
 665             int sorterIndex = 0;
 666             /* Optimization(Jan, 2001) */
 667             for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
 668             {
 669                 //
 670                 //  Build up a set of states which is the union of all of
 671                 //  the follow sets of DFA positions that are in the current
 672                 //  state. If we gave away the new set last time through then
 673                 //  create a new one. Otherwise, zero out the existing one.
 674                 //
 675                 if (newSet == null)
 676                     newSet = new CMStateSet(fLeafCount);
 677                 else
 678                     newSet.zeroBits();
 679 
 680             /* Optimization(Jan, 2001) */
 681                 int leafIndex = fLeafSorter[sorterIndex++];
 682 
 683                 while (leafIndex != -1) {
 684                 // If this leaf index (DFA position) is in the current set...
 685                     if (setT.getBit(leafIndex))
 686                     {
 687                         //
 688                         //  If this leaf is the current input symbol, then we
 689                         //  want to add its follow list to the set of states to
 690                         //  transition to from the current state.
 691                         //
 692                                 newSet.union(fFollowList[leafIndex]);
 693                             }
 694 
 695                    leafIndex = fLeafSorter[sorterIndex++];
 696         }
 697             /* Optimization(Jan, 2001) */
 698 
 699                 //
 700                 //  If this new set is not empty, then see if its in the list
 701                 //  of states to do. If not, then add it.
 702                 //
 703                 if (!newSet.isEmpty())
 704                 {
 705                     //
 706                     //  Search the 'states to do' list to see if this new
 707                     //  state set is already in there.
 708                     //
 709 
 710             /* Optimization(Jan, 2001) */
 711             Integer stateObj = (Integer)stateTable.get(newSet);
 712             int stateIndex = (stateObj == null ? curState : stateObj.intValue());
 713             /* Optimization(Jan, 2001) */
 714 
 715                     // If we did not find it, then add it
 716                     if (stateIndex == curState)
 717                     {
 718                         //
 719                         //  Put this new state into the states to do and init
 720                         //  a new entry at the same index in the transition
 721                         //  table.
 722                         //
 723                         statesToDo[curState] = newSet;
 724                         fTransTable[curState] = makeDefStateList();
 725 
 726             /* Optimization(Jan, 2001) */
 727                         stateTable.put(newSet, new Integer(curState));
 728             /* Optimization(Jan, 2001) */
 729 
 730                         // We now have a new state to do so bump the count
 731                         curState++;
 732 
 733                         //
 734                         //  Null out the new set to indicate we adopted it.
 735                         //  This will cause the creation of a new set on the
 736                         //  next time around the loop.
 737                         //
 738                         newSet = null;
 739                     }
 740 
 741                     //
 742                     //  Now set this state in the transition table's entry
 743                     //  for this element (using its index), with the DFA
 744                     //  state we will move to from the current state when we
 745                     //  see this input element.
 746                     //
 747                     transEntry[elemIndex] = stateIndex;
 748 
 749                     // Expand the arrays if we're full
 750                     if (curState == curArraySize)
 751                     {
 752                         //
 753                         //  Yikes, we overflowed the initial array size, so
 754                         //  we've got to expand all of these arrays. So adjust
 755                         //  up the size by 50% and allocate new arrays.
 756                         //
 757                         final int newSize = (int)(curArraySize * 1.5);
 758                         CMStateSet[] newToDo = new CMStateSet[newSize];
 759                         boolean[] newFinalFlags = new boolean[newSize];
 760                         int[][] newTransTable = new int[newSize][];
 761 
 762                         // Copy over all of the existing content
 763                         System.arraycopy(statesToDo, 0, newToDo, 0, curArraySize);
 764                         System.arraycopy(fFinalStateFlags, 0, newFinalFlags, 0, curArraySize);
 765                         System.arraycopy(fTransTable, 0, newTransTable, 0, curArraySize);
 766 
 767                         // Store the new array size
 768                         curArraySize = newSize;
 769                         statesToDo = newToDo;
 770                         fFinalStateFlags = newFinalFlags;
 771                         fTransTable = newTransTable;
 772                     }
 773                 }
 774             }
 775         }
 776 
 777         // Check to see if we can set the fEmptyContentIsValid flag.
 778         fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable();
 779 
 780         //
 781         //  And now we can say bye bye to the temp representation since we've
 782         //  built the DFA.
 783         //
 784         if (DEBUG_VALIDATE_CONTENT)
 785             dumpTree(fHeadNode, 0);
 786         fHeadNode = null;
 787         fLeafList = null;
 788         fFollowList = null;
 789 
 790     }
 791 
 792     /**
 793      * Calculates the follow list of the current node.
 794      *
 795      * @param nodeCur The curent node.
 796      *
 797      * @exception CMException Thrown if follow list cannot be calculated.
 798      */
 799     private void calcFollowList(CMNode nodeCur)
 800     {
 801         // Recurse as required
 802         if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 803         {
 804             // Recurse only
 805             calcFollowList(((CMBinOp)nodeCur).getLeft());
 806             calcFollowList(((CMBinOp)nodeCur).getRight());
 807         }
 808          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)
 809         {
 810             // Recurse first
 811             calcFollowList(((CMBinOp)nodeCur).getLeft());
 812             calcFollowList(((CMBinOp)nodeCur).getRight());
 813 
 814             //
 815             //  Now handle our level. We use our left child's last pos
 816             //  set and our right child's first pos set, so go ahead and
 817             //  get them ahead of time.
 818             //
 819             final CMStateSet last  = ((CMBinOp)nodeCur).getLeft().lastPos();
 820             final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos();
 821 
 822             //
 823             //  Now, for every position which is in our left child's last set
 824             //  add all of the states in our right child's first set to the
 825             //  follow set for that position.
 826             //
 827             for (int index = 0; index < fLeafCount; index++)
 828             {
 829                 if (last.getBit(index))
 830                     fFollowList[index].union(first);
 831             }
 832         }
 833          /***
 834          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 835         {
 836             // Recurse first
 837             calcFollowList(((CMUniOp)nodeCur).getChild());
 838 
 839             //
 840             //  Now handle our level. We use our own first and last position
 841             //  sets, so get them up front.
 842             //
 843             final CMStateSet first = nodeCur.firstPos();
 844             final CMStateSet last  = nodeCur.lastPos();
 845 
 846             //
 847             //  For every position which is in our last position set, add all
 848             //  of our first position states to the follow set for that
 849             //  position.
 850             //
 851             for (int index = 0; index < fLeafCount; index++)
 852             {
 853                 if (last.getBit(index))
 854                     fFollowList[index].union(first);
 855             }
 856         }
 857          else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 858               ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE))
 859         {
 860             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 861         }
 862         /***/
 863          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
 864             || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 865         {
 866             // Recurse first
 867             calcFollowList(((CMUniOp)nodeCur).getChild());
 868 
 869             //
 870             //  Now handle our level. We use our own first and last position
 871             //  sets, so get them up front.
 872             //
 873             final CMStateSet first = nodeCur.firstPos();
 874             final CMStateSet last  = nodeCur.lastPos();
 875 
 876             //
 877             //  For every position which is in our last position set, add all
 878             //  of our first position states to the follow set for that
 879             //  position.
 880             //
 881             for (int index = 0; index < fLeafCount; index++)
 882             {
 883                 if (last.getBit(index))
 884                     fFollowList[index].union(first);
 885             }
 886         }
 887 
 888         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) {
 889             // Recurse only
 890             calcFollowList(((CMUniOp)nodeCur).getChild());
 891         }
 892          /***/
 893     }
 894 
 895     /**
 896      * Dumps the tree of the current node to standard output.
 897      *
 898      * @param nodeCur The current node.
 899      * @param level   The maximum levels to output.
 900      *
 901      * @exception CMException Thrown on error.
 902      */
 903     private void dumpTree(CMNode nodeCur, int level)
 904     {
 905         for (int index = 0; index < level; index++)
 906             System.out.print("   ");
 907 
 908         int type = nodeCur.type();
 909         if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 910         ||  (type == XMLContentSpec.CONTENTSPECNODE_SEQ))
 911         {
 912             if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 913                 System.out.print("Choice Node ");
 914             else
 915                 System.out.print("Seq Node ");
 916 
 917             if (nodeCur.isNullable())
 918                 System.out.print("Nullable ");
 919 
 920             System.out.print("firstPos=");
 921             System.out.print(nodeCur.firstPos().toString());
 922             System.out.print(" lastPos=");
 923             System.out.println(nodeCur.lastPos().toString());
 924 
 925             dumpTree(((CMBinOp)nodeCur).getLeft(), level+1);
 926             dumpTree(((CMBinOp)nodeCur).getRight(), level+1);
 927         }
 928          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 929         {
 930             System.out.print("Rep Node ");
 931 
 932             if (nodeCur.isNullable())
 933                 System.out.print("Nullable ");
 934 
 935             System.out.print("firstPos=");
 936             System.out.print(nodeCur.firstPos().toString());
 937             System.out.print(" lastPos=");
 938             System.out.println(nodeCur.lastPos().toString());
 939 
 940             dumpTree(((CMUniOp)nodeCur).getChild(), level+1);
 941         }
 942          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
 943         {
 944             System.out.print
 945             (
 946                 "Leaf: (pos="
 947                 + ((CMLeaf)nodeCur).getPosition()
 948                 + "), "
 949                 + ((CMLeaf)nodeCur).getElement()
 950                 + "(elemIndex="
 951                 + ((CMLeaf)nodeCur).getElement()
 952                 + ") "
 953             );
 954 
 955             if (nodeCur.isNullable())
 956                 System.out.print(" Nullable ");
 957 
 958             System.out.print("firstPos=");
 959             System.out.print(nodeCur.firstPos().toString());
 960             System.out.print(" lastPos=");
 961             System.out.println(nodeCur.lastPos().toString());
 962         }
 963          else
 964         {
 965             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 966         }
 967     }
 968 
 969 
 970     /**
 971      * -1 is used to represent bad transitions in the transition table
 972      * entry for each state. So each entry is initialized to an all -1
 973      * array. This method creates a new entry and initializes it.
 974      */
 975     private int[] makeDefStateList()
 976     {
 977         int[] retArray = new int[fElemMapSize];
 978         for (int index = 0; index < fElemMapSize; index++)
 979             retArray[index] = -1;
 980         return retArray;
 981     }
 982 
 983     /** Post tree build initialization. */
 984     private int postTreeBuildInit(CMNode nodeCur, int curIndex)
 985     {
 986         // Set the maximum states on this node
 987         nodeCur.setMaxStates(fLeafCount);
 988 
 989         // Recurse as required
 990         if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY ||
 991             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL ||
 992             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 993             // REVISIT: Don't waste these structures.
 994             QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI());
 995             fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition());
 996             fLeafListType[curIndex] = nodeCur.type();
 997             curIndex++;
 998         }
 999         else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
1000         ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ))
1001         {
1002             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex);
1003             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex);
1004         }
1005          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
1006              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE
1007              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)
1008         {
1009             curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex);
1010         }
1011          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
1012         {
1013             //
1014             //  Put this node in the leaf list at the current index if its
1015             //  a non-epsilon leaf.
1016             //
1017              final QName node = ((CMLeaf)nodeCur).getElement();
1018             if (node.localpart != fEpsilonString) {
1019                 fLeafList[curIndex] = (CMLeaf)nodeCur;
1020                 fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF;
1021                 curIndex++;
1022             }
1023         }
1024          else
1025         {
1026             throw new RuntimeException("ImplementationMessages.VAL_NIICM: type="+nodeCur.type());
1027         }
1028         return curIndex;
1029     }
1030 
1031 } // class DFAContentModel