1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * The Apache Software License, Version 1.1 7 * 8 * 9 * Copyright (c) 1999-2002 The Apache Software Foundation. All rights 10 * reserved. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in 21 * the documentation and/or other materials provided with the 22 * distribution. 23 * 24 * 3. The end-user documentation included with the redistribution, 25 * if any, must include the following acknowledgment: 26 * "This product includes software developed by the 27 * Apache Software Foundation (http://www.apache.org/)." 28 * Alternately, this acknowledgment may appear in the software itself, 29 * if and wherever such third-party acknowledgments normally appear. 30 * 31 * 4. The names "Xerces" and "Apache Software Foundation" must 32 * not be used to endorse or promote products derived from this 33 * software without prior written permission. For written 34 * permission, please contact apache@apache.org. 35 * 36 * 5. Products derived from this software may not be called "Apache", 37 * nor may "Apache" appear in their name, without prior written 38 * permission of the Apache Software Foundation. 39 * 40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 43 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 44 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * ==================================================================== 53 * 54 * This software consists of voluntary contributions made by many 55 * individuals on behalf of the Apache Software Foundation and was 56 * originally based on software copyright (c) 1999, International 57 * Business Machines, Inc., http://www.apache.org. For more 58 * information on the Apache Software Foundation, please see 59 * <http://www.apache.org/>. 60 */ 61 62 package com.sun.org.apache.xerces.internal.impl.dtd.models; 63 64 import java.util.HashMap; 65 66 import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec; 67 import com.sun.org.apache.xerces.internal.xni.QName; 68 69 /** 70 71 * DFAContentModel is the derivative of ContentModel that does 72 * all of the non-trivial element content validation. This class does 73 * the conversion from the regular expression to the DFA that 74 * it then uses in its validation algorithm. 75 * <p> 76 * <b>Note:</b> Upstream work insures that this class will never see 77 * a content model with PCDATA in it. Any model with PCDATA is 'mixed' 78 * and is handled via the MixedContentModel class since mixed models 79 * are very constrained in form and easily handled via a special case. 80 * This also makes implementation of this class much easier. 81 * 82 * @xerces.internal 83 * 84 */ 85 public class DFAContentModel 86 implements ContentModelValidator { 87 88 // 89 // Constants 90 // 91 // special strings 92 93 /** Epsilon string. */ 94 private static String fEpsilonString = "<<CMNODE_EPSILON>>"; 95 96 /** End-of-content string. */ 97 private static String fEOCString = "<<CMNODE_EOC>>"; 98 99 /** initializing static members **/ 100 static { 101 fEpsilonString = fEpsilonString.intern(); 102 fEOCString = fEOCString.intern(); 103 } 104 105 // debugging 106 107 /** Set to true to debug content model validation. */ 108 private static final boolean DEBUG_VALIDATE_CONTENT = false; 109 110 // 111 // Data 112 // 113 114 /* this is the EquivClassComparator object */ 115 //private EquivClassComparator comparator = null; 116 117 /** 118 * This is the map of unique input symbol elements to indices into 119 * each state's per-input symbol transition table entry. This is part 120 * of the built DFA information that must be kept around to do the 121 * actual validation. 122 */ 123 private QName fElemMap[] = null; 124 125 /** 126 * This is a map of whether the element map contains information 127 * related to ANY models. 128 */ 129 private int fElemMapType[] = null; 130 131 /** The element map size. */ 132 private int fElemMapSize = 0; 133 134 /** Boolean to distinguish Schema Mixed Content */ 135 private boolean fMixed; 136 137 /** 138 * The NFA position of the special EOC (end of content) node. This 139 * is saved away since it's used during the DFA build. 140 */ 141 private int fEOCPos = 0; 142 143 144 /** 145 * This is an array of booleans, one per state (there are 146 * fTransTableSize states in the DFA) that indicates whether that 147 * state is a final state. 148 */ 149 private boolean fFinalStateFlags[] = null; 150 151 /** 152 * The list of follow positions for each NFA position (i.e. for each 153 * non-epsilon leaf node.) This is only used during the building of 154 * the DFA, and is let go afterwards. 155 */ 156 private CMStateSet fFollowList[] = null; 157 158 /** 159 * This is the head node of our intermediate representation. It is 160 * only non-null during the building of the DFA (just so that it 161 * does not have to be passed all around.) Once the DFA is built, 162 * this is no longer required so its nulled out. 163 */ 164 private CMNode fHeadNode = null; 165 166 /** 167 * The count of leaf nodes. This is an important number that set some 168 * limits on the sizes of data structures in the DFA process. 169 */ 170 private int fLeafCount = 0; 171 172 /** 173 * An array of non-epsilon leaf nodes, which is used during the DFA 174 * build operation, then dropped. 175 */ 176 private CMLeaf fLeafList[] = null; 177 178 /** Array mapping ANY types to the leaf list. */ 179 private int fLeafListType[] = null; 180 181 //private ContentLeafNameTypeVector fLeafNameTypeVector = null; 182 183 /** 184 * The string pool of our parser session. This is set during construction 185 * and kept around. 186 */ 187 //private StringPool fStringPool = null; 188 189 /** 190 * This is the transition table that is the main by product of all 191 * of the effort here. It is an array of arrays of ints. The first 192 * dimension is the number of states we end up with in the DFA. The 193 * second dimensions is the number of unique elements in the content 194 * model (fElemMapSize). Each entry in the second dimension indicates 195 * the new state given that input for the first dimension's start 196 * state. 197 * <p> 198 * The fElemMap array handles mapping from element indexes to 199 * positions in the second dimension of the transition table. 200 */ 201 private int fTransTable[][] = null; 202 203 /** 204 * The number of valid entries in the transition table, and in the other 205 * related tables such as fFinalStateFlags. 206 */ 207 private int fTransTableSize = 0; 208 209 /** 210 * Flag that indicates that even though we have a "complicated" 211 * content model, it is valid to have no content. In other words, 212 * all parts of the content model are optional. For example: 213 * <pre> 214 * <!ELEMENT AllOptional (Optional*,NotRequired?)> 215 * </pre> 216 */ 217 private boolean fEmptyContentIsValid = false; 218 219 // temp variables 220 221 /** Temporary qualified name. */ 222 private final QName fQName = new QName(); 223 224 // 225 // Constructors 226 // 227 228 229 // 230 // Constructors 231 // 232 233 /** 234 * Constructs a DFA content model. 235 * 236 * @param syntaxTree The syntax tree of the content model. 237 * @param leafCount The number of leaves. 238 * @param mixed 239 * 240 */ 241 public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) { 242 // Store away our index and pools in members 243 //fStringPool = stringPool; 244 fLeafCount = leafCount; 245 246 247 // this is for Schema Mixed Content 248 fMixed = mixed; 249 250 // 251 // Ok, so lets grind through the building of the DFA. This method 252 // handles the high level logic of the algorithm, but it uses a 253 // number of helper classes to do its thing. 254 // 255 // In order to avoid having hundreds of references to the error and 256 // string handlers around, this guy and all of his helper classes 257 // just throw a simple exception and we then pass it along. 258 // 259 buildDFA(syntaxTree); 260 } 261 262 // 263 // ContentModelValidator methods 264 // 265 266 /** 267 * Check that the specified content is valid according to this 268 * content model. This method can also be called to do 'what if' 269 * testing of content models just to see if they would be valid. 270 * <p> 271 * A value of -1 in the children array indicates a PCDATA node. All other 272 * indexes will be positive and represent child elements. The count can be 273 * zero, since some elements have the EMPTY content model and that must be 274 * confirmed. 275 * 276 * @param children The children of this element. Each integer is an index within 277 * the <code>StringPool</code> of the child element name. An index 278 * of -1 is used to indicate an occurrence of non-whitespace character 279 * data. 280 * @param offset Offset into the array where the children starts. 281 * @param length The number of entries in the <code>children</code> array. 282 * 283 * @return The value -1 if fully valid, else the 0 based index of the child 284 * that first failed. If the value returned is equal to the number 285 * of children, then the specified children are valid but additional 286 * content is required to reach a valid ending state. 287 * 288 */ 289 public int validate(QName[] children, int offset, int length) { 290 291 if (DEBUG_VALIDATE_CONTENT) 292 System.out.println("DFAContentModel#validateContent"); 293 294 // 295 // A DFA content model must *always* have at least 1 child 296 // so a failure is given if no children present. 297 // 298 // Defect 782: This is an incorrect statement because a DFA 299 // content model is also used for constructions such as: 300 // 301 // (Optional*,NotRequired?) 302 // 303 // where a perfectly valid content would be NO CHILDREN. 304 // Therefore, if there are no children, we must check to 305 // see if the CMNODE_EOC marker is a valid start state! -Ac 306 // 307 if (length == 0) { 308 if (DEBUG_VALIDATE_CONTENT) { 309 System.out.println("!!! no children"); 310 System.out.println("elemMap="+fElemMap); 311 for (int i = 0; i < fElemMap.length; i++) { 312 String uri = fElemMap[i].uri; 313 String localpart = fElemMap[i].localpart; 314 315 System.out.println("fElemMap["+i+"]="+uri+","+ 316 localpart+" ("+ 317 uri+", "+ 318 localpart+ 319 ')'); 320 321 } 322 System.out.println("EOCIndex="+fEOCString); 323 } 324 325 return fEmptyContentIsValid ? -1 : 0; 326 327 } // if child count == 0 328 329 // 330 // Lets loop through the children in the array and move our way 331 // through the states. Note that we use the fElemMap array to map 332 // an element index to a state index. 333 // 334 int curState = 0; 335 for (int childIndex = 0; childIndex < length; childIndex++) 336 { 337 // Get the current element index out 338 final QName curElem = children[offset + childIndex]; 339 // ignore mixed text 340 if (fMixed && curElem.localpart == null) { 341 continue; 342 } 343 344 // Look up this child in our element map 345 int elemIndex = 0; 346 for (; elemIndex < fElemMapSize; elemIndex++) 347 { 348 int type = fElemMapType[elemIndex] & 0x0f ; 349 if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) { 350 //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]); 351 if (fElemMap[elemIndex].rawname == curElem.rawname) { 352 break; 353 } 354 } 355 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) { 356 String uri = fElemMap[elemIndex].uri; 357 if (uri == null || uri == curElem.uri) { 358 break; 359 } 360 } 361 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) { 362 if (curElem.uri == null) { 363 break; 364 } 365 } 366 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) { 367 if (fElemMap[elemIndex].uri != curElem.uri) { 368 break; 369 } 370 } 371 } 372 373 // If we didn't find it, then obviously not valid 374 if (elemIndex == fElemMapSize) { 375 if (DEBUG_VALIDATE_CONTENT) { 376 System.out.println("!!! didn't find it"); 377 378 System.out.println("curElem : " +curElem ); 379 for (int i=0; i<fElemMapSize; i++) { 380 System.out.println("fElemMap["+i+"] = " +fElemMap[i] ); 381 System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] ); 382 } 383 } 384 385 return childIndex; 386 } 387 388 // 389 // Look up the next state for this input symbol when in the 390 // current state. 391 // 392 curState = fTransTable[curState][elemIndex]; 393 394 // If its not a legal transition, then invalid 395 if (curState == -1) { 396 if (DEBUG_VALIDATE_CONTENT) 397 System.out.println("!!! not a legal transition"); 398 return childIndex; 399 } 400 } 401 402 // 403 // We transitioned all the way through the input list. However, that 404 // does not mean that we ended in a final state. So check whether 405 // our ending state is a final state. 406 // 407 if (DEBUG_VALIDATE_CONTENT) 408 System.out.println("curState="+curState+", childCount="+length); 409 if (!fFinalStateFlags[curState]) 410 return length; 411 412 // success! 413 return -1; 414 } // validate 415 416 417 // 418 // Private methods 419 // 420 421 /** 422 * Builds the internal DFA transition table from the given syntax tree. 423 * 424 * @param syntaxTree The syntax tree. 425 * 426 * @exception CMException Thrown if DFA cannot be built. 427 */ 428 private void buildDFA(CMNode syntaxTree) 429 { 430 // 431 // The first step we need to take is to rewrite the content model 432 // using our CMNode objects, and in the process get rid of any 433 // repetition short cuts, converting them into '*' style repetitions 434 // or getting rid of repetitions altogether. 435 // 436 // The conversions done are: 437 // 438 // x+ -> (x|x*) 439 // x? -> (x|epsilon) 440 // 441 // This is a relatively complex scenario. What is happening is that 442 // we create a top level binary node of which the special EOC value 443 // is set as the right side node. The the left side is set to the 444 // rewritten syntax tree. The source is the original content model 445 // info from the decl pool. The rewrite is done by buildSyntaxTree() 446 // which recurses the decl pool's content of the element and builds 447 // a new tree in the process. 448 // 449 // Note that, during this operation, we set each non-epsilon leaf 450 // node's DFA state position and count the number of such leafs, which 451 // is left in the fLeafCount member. 452 // 453 // The nodeTmp object is passed in just as a temp node to use during 454 // the recursion. Otherwise, we'd have to create a new node on every 455 // level of recursion, which would be piggy in Java (as is everything 456 // for that matter.) 457 // 458 459 /* MODIFIED (Jan, 2001) 460 * 461 * Use following rules. 462 * nullable(x+) := nullable(x), first(x+) := first(x), last(x+) := last(x) 463 * nullable(x?) := true, first(x?) := first(x), last(x?) := last(x) 464 * 465 * The same computation of follow as x* is applied to x+ 466 * 467 * The modification drastically reduces computation time of 468 * "(a, (b, a+, (c, (b, a+)+, a+, (d, (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+" 469 */ 470 471 fQName.setValues(null, fEOCString, fEOCString, null); 472 CMLeaf nodeEOC = new CMLeaf(fQName); 473 fHeadNode = new CMBinOp 474 ( 475 XMLContentSpec.CONTENTSPECNODE_SEQ 476 , syntaxTree 477 , nodeEOC 478 ); 479 480 // 481 // And handle specially the EOC node, which also must be numbered 482 // and counted as a non-epsilon leaf node. It could not be handled 483 // in the above tree build because it was created before all that 484 // started. We save the EOC position since its used during the DFA 485 // building loop. 486 // 487 fEOCPos = fLeafCount; 488 nodeEOC.setPosition(fLeafCount++); 489 490 // 491 // Ok, so now we have to iterate the new tree and do a little more 492 // work now that we know the leaf count. One thing we need to do is 493 // to calculate the first and last position sets of each node. This 494 // is cached away in each of the nodes. 495 // 496 // Along the way we also set the leaf count in each node as the 497 // maximum state count. They must know this in order to create their 498 // first/last pos sets. 499 // 500 // We also need to build an array of references to the non-epsilon 501 // leaf nodes. Since we iterate it in the same way as before, this 502 // will put them in the array according to their position values. 503 // 504 fLeafList = new CMLeaf[fLeafCount]; 505 fLeafListType = new int[fLeafCount]; 506 postTreeBuildInit(fHeadNode, 0); 507 508 // 509 // And, moving onward... We now need to build the follow position 510 // sets for all the nodes. So we allocate an array of state sets, 511 // one for each leaf node (i.e. each DFA position.) 512 // 513 fFollowList = new CMStateSet[fLeafCount]; 514 for (int index = 0; index < fLeafCount; index++) 515 fFollowList[index] = new CMStateSet(fLeafCount); 516 calcFollowList(fHeadNode); 517 // 518 // And finally the big push... Now we build the DFA using all the 519 // states and the tree we've built up. First we set up the various 520 // data structures we are going to use while we do this. 521 // 522 // First of all we need an array of unique element names in our 523 // content model. For each transition table entry, we need a set of 524 // contiguous indices to represent the transitions for a particular 525 // input element. So we need to a zero based range of indexes that 526 // map to element types. This element map provides that mapping. 527 // 528 fElemMap = new QName[fLeafCount]; 529 fElemMapType = new int[fLeafCount]; 530 fElemMapSize = 0; 531 for (int outIndex = 0; outIndex < fLeafCount; outIndex++) 532 { 533 fElemMap[outIndex] = new QName(); 534 535 /**** 536 if ( (fLeafListType[outIndex] & 0x0f) != 0 ) { 537 if (fLeafNameTypeVector == null) { 538 fLeafNameTypeVector = new ContentLeafNameTypeVector(); 539 } 540 } 541 /****/ 542 543 // Get the current leaf's element index 544 final QName element = fLeafList[outIndex].getElement(); 545 546 // See if the current leaf node's element index is in the list 547 int inIndex = 0; 548 for (; inIndex < fElemMapSize; inIndex++) 549 { 550 if (fElemMap[inIndex].rawname == element.rawname) { 551 break; 552 } 553 } 554 555 // If it was not in the list, then add it, if not the EOC node 556 if (inIndex == fElemMapSize) { 557 fElemMap[fElemMapSize].setValues(element); 558 fElemMapType[fElemMapSize] = fLeafListType[outIndex]; 559 fElemMapSize++; 560 } 561 } 562 // set up the fLeafNameTypeVector object if there is one. 563 /***** 564 if (fLeafNameTypeVector != null) { 565 fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize); 566 } 567 568 /*** 569 * Optimization(Jan, 2001); We sort fLeafList according to 570 * elemIndex which is *uniquely* associated to each leaf. 571 * We are *assuming* that each element appears in at least one leaf. 572 **/ 573 574 int[] fLeafSorter = new int[fLeafCount + fElemMapSize]; 575 int fSortCount = 0; 576 577 for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { 578 for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { 579 final QName leaf = fLeafList[leafIndex].getElement(); 580 final QName element = fElemMap[elemIndex]; 581 if (leaf.rawname == element.rawname) { 582 fLeafSorter[fSortCount++] = leafIndex; 583 } 584 } 585 fLeafSorter[fSortCount++] = -1; 586 } 587 588 /* Optimization(Jan, 2001) */ 589 590 // 591 // Next lets create some arrays, some that that hold transient 592 // information during the DFA build and some that are permament. 593 // These are kind of sticky since we cannot know how big they will 594 // get, but we don't want to use any Java collections because of 595 // performance. 596 // 597 // Basically they will probably be about fLeafCount*2 on average, 598 // but can be as large as 2^(fLeafCount*2), worst case. So we start 599 // with fLeafCount*4 as a middle ground. This will be very unlikely 600 // to ever have to expand, though it if does, the overhead will be 601 // somewhat ugly. 602 // 603 int curArraySize = fLeafCount * 4; 604 CMStateSet[] statesToDo = new CMStateSet[curArraySize]; 605 fFinalStateFlags = new boolean[curArraySize]; 606 fTransTable = new int[curArraySize][]; 607 608 // 609 // Ok we start with the initial set as the first pos set of the 610 // head node (which is the seq node that holds the content model 611 // and the EOC node.) 612 // 613 CMStateSet setT = fHeadNode.firstPos(); 614 615 // 616 // Init our two state flags. Basically the unmarked state counter 617 // is always chasing the current state counter. When it catches up, 618 // that means we made a pass through that did not add any new states 619 // to the lists, at which time we are done. We could have used a 620 // expanding array of flags which we used to mark off states as we 621 // complete them, but this is easier though less readable maybe. 622 // 623 int unmarkedState = 0; 624 int curState = 0; 625 626 // 627 // Init the first transition table entry, and put the initial state 628 // into the states to do list, then bump the current state. 629 // 630 fTransTable[curState] = makeDefStateList(); 631 statesToDo[curState] = setT; 632 curState++; 633 634 /* Optimization(Jan, 2001); This is faster for 635 * a large content model such as, "(t001+|t002+|.... |t500+)". 636 */ 637 638 HashMap stateTable = new HashMap(); 639 640 /* Optimization(Jan, 2001) */ 641 642 // 643 // Ok, almost done with the algorithm... We now enter the 644 // loop where we go until the states done counter catches up with 645 // the states to do counter. 646 // 647 while (unmarkedState < curState) 648 { 649 // 650 // Get the first unmarked state out of the list of states to do. 651 // And get the associated transition table entry. 652 // 653 setT = statesToDo[unmarkedState]; 654 int[] transEntry = fTransTable[unmarkedState]; 655 656 // Mark this one final if it contains the EOC state 657 fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos); 658 659 // Bump up the unmarked state count, marking this state done 660 unmarkedState++; 661 662 // Loop through each possible input symbol in the element map 663 CMStateSet newSet = null; 664 /* Optimization(Jan, 2001) */ 665 int sorterIndex = 0; 666 /* Optimization(Jan, 2001) */ 667 for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) 668 { 669 // 670 // Build up a set of states which is the union of all of 671 // the follow sets of DFA positions that are in the current 672 // state. If we gave away the new set last time through then 673 // create a new one. Otherwise, zero out the existing one. 674 // 675 if (newSet == null) 676 newSet = new CMStateSet(fLeafCount); 677 else 678 newSet.zeroBits(); 679 680 /* Optimization(Jan, 2001) */ 681 int leafIndex = fLeafSorter[sorterIndex++]; 682 683 while (leafIndex != -1) { 684 // If this leaf index (DFA position) is in the current set... 685 if (setT.getBit(leafIndex)) 686 { 687 // 688 // If this leaf is the current input symbol, then we 689 // want to add its follow list to the set of states to 690 // transition to from the current state. 691 // 692 newSet.union(fFollowList[leafIndex]); 693 } 694 695 leafIndex = fLeafSorter[sorterIndex++]; 696 } 697 /* Optimization(Jan, 2001) */ 698 699 // 700 // If this new set is not empty, then see if its in the list 701 // of states to do. If not, then add it. 702 // 703 if (!newSet.isEmpty()) 704 { 705 // 706 // Search the 'states to do' list to see if this new 707 // state set is already in there. 708 // 709 710 /* Optimization(Jan, 2001) */ 711 Integer stateObj = (Integer)stateTable.get(newSet); 712 int stateIndex = (stateObj == null ? curState : stateObj.intValue()); 713 /* Optimization(Jan, 2001) */ 714 715 // If we did not find it, then add it 716 if (stateIndex == curState) 717 { 718 // 719 // Put this new state into the states to do and init 720 // a new entry at the same index in the transition 721 // table. 722 // 723 statesToDo[curState] = newSet; 724 fTransTable[curState] = makeDefStateList(); 725 726 /* Optimization(Jan, 2001) */ 727 stateTable.put(newSet, new Integer(curState)); 728 /* Optimization(Jan, 2001) */ 729 730 // We now have a new state to do so bump the count 731 curState++; 732 733 // 734 // Null out the new set to indicate we adopted it. 735 // This will cause the creation of a new set on the 736 // next time around the loop. 737 // 738 newSet = null; 739 } 740 741 // 742 // Now set this state in the transition table's entry 743 // for this element (using its index), with the DFA 744 // state we will move to from the current state when we 745 // see this input element. 746 // 747 transEntry[elemIndex] = stateIndex; 748 749 // Expand the arrays if we're full 750 if (curState == curArraySize) 751 { 752 // 753 // Yikes, we overflowed the initial array size, so 754 // we've got to expand all of these arrays. So adjust 755 // up the size by 50% and allocate new arrays. 756 // 757 final int newSize = (int)(curArraySize * 1.5); 758 CMStateSet[] newToDo = new CMStateSet[newSize]; 759 boolean[] newFinalFlags = new boolean[newSize]; 760 int[][] newTransTable = new int[newSize][]; 761 762 // Copy over all of the existing content 763 System.arraycopy(statesToDo, 0, newToDo, 0, curArraySize); 764 System.arraycopy(fFinalStateFlags, 0, newFinalFlags, 0, curArraySize); 765 System.arraycopy(fTransTable, 0, newTransTable, 0, curArraySize); 766 767 // Store the new array size 768 curArraySize = newSize; 769 statesToDo = newToDo; 770 fFinalStateFlags = newFinalFlags; 771 fTransTable = newTransTable; 772 } 773 } 774 } 775 } 776 777 // Check to see if we can set the fEmptyContentIsValid flag. 778 fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable(); 779 780 // 781 // And now we can say bye bye to the temp representation since we've 782 // built the DFA. 783 // 784 if (DEBUG_VALIDATE_CONTENT) 785 dumpTree(fHeadNode, 0); 786 fHeadNode = null; 787 fLeafList = null; 788 fFollowList = null; 789 790 } 791 792 /** 793 * Calculates the follow list of the current node. 794 * 795 * @param nodeCur The curent node. 796 * 797 * @exception CMException Thrown if follow list cannot be calculated. 798 */ 799 private void calcFollowList(CMNode nodeCur) 800 { 801 // Recurse as required 802 if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE) 803 { 804 // Recurse only 805 calcFollowList(((CMBinOp)nodeCur).getLeft()); 806 calcFollowList(((CMBinOp)nodeCur).getRight()); 807 } 808 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ) 809 { 810 // Recurse first 811 calcFollowList(((CMBinOp)nodeCur).getLeft()); 812 calcFollowList(((CMBinOp)nodeCur).getRight()); 813 814 // 815 // Now handle our level. We use our left child's last pos 816 // set and our right child's first pos set, so go ahead and 817 // get them ahead of time. 818 // 819 final CMStateSet last = ((CMBinOp)nodeCur).getLeft().lastPos(); 820 final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos(); 821 822 // 823 // Now, for every position which is in our left child's last set 824 // add all of the states in our right child's first set to the 825 // follow set for that position. 826 // 827 for (int index = 0; index < fLeafCount; index++) 828 { 829 if (last.getBit(index)) 830 fFollowList[index].union(first); 831 } 832 } 833 /*** 834 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE) 835 { 836 // Recurse first 837 calcFollowList(((CMUniOp)nodeCur).getChild()); 838 839 // 840 // Now handle our level. We use our own first and last position 841 // sets, so get them up front. 842 // 843 final CMStateSet first = nodeCur.firstPos(); 844 final CMStateSet last = nodeCur.lastPos(); 845 846 // 847 // For every position which is in our last position set, add all 848 // of our first position states to the follow set for that 849 // position. 850 // 851 for (int index = 0; index < fLeafCount; index++) 852 { 853 if (last.getBit(index)) 854 fFollowList[index].union(first); 855 } 856 } 857 else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE) 858 || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)) 859 { 860 throw new RuntimeException("ImplementationMessages.VAL_NIICM"); 861 } 862 /***/ 863 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE 864 || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE) 865 { 866 // Recurse first 867 calcFollowList(((CMUniOp)nodeCur).getChild()); 868 869 // 870 // Now handle our level. We use our own first and last position 871 // sets, so get them up front. 872 // 873 final CMStateSet first = nodeCur.firstPos(); 874 final CMStateSet last = nodeCur.lastPos(); 875 876 // 877 // For every position which is in our last position set, add all 878 // of our first position states to the follow set for that 879 // position. 880 // 881 for (int index = 0; index < fLeafCount; index++) 882 { 883 if (last.getBit(index)) 884 fFollowList[index].union(first); 885 } 886 } 887 888 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) { 889 // Recurse only 890 calcFollowList(((CMUniOp)nodeCur).getChild()); 891 } 892 /***/ 893 } 894 895 /** 896 * Dumps the tree of the current node to standard output. 897 * 898 * @param nodeCur The current node. 899 * @param level The maximum levels to output. 900 * 901 * @exception CMException Thrown on error. 902 */ 903 private void dumpTree(CMNode nodeCur, int level) 904 { 905 for (int index = 0; index < level; index++) 906 System.out.print(" "); 907 908 int type = nodeCur.type(); 909 if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE) 910 || (type == XMLContentSpec.CONTENTSPECNODE_SEQ)) 911 { 912 if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE) 913 System.out.print("Choice Node "); 914 else 915 System.out.print("Seq Node "); 916 917 if (nodeCur.isNullable()) 918 System.out.print("Nullable "); 919 920 System.out.print("firstPos="); 921 System.out.print(nodeCur.firstPos().toString()); 922 System.out.print(" lastPos="); 923 System.out.println(nodeCur.lastPos().toString()); 924 925 dumpTree(((CMBinOp)nodeCur).getLeft(), level+1); 926 dumpTree(((CMBinOp)nodeCur).getRight(), level+1); 927 } 928 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE) 929 { 930 System.out.print("Rep Node "); 931 932 if (nodeCur.isNullable()) 933 System.out.print("Nullable "); 934 935 System.out.print("firstPos="); 936 System.out.print(nodeCur.firstPos().toString()); 937 System.out.print(" lastPos="); 938 System.out.println(nodeCur.lastPos().toString()); 939 940 dumpTree(((CMUniOp)nodeCur).getChild(), level+1); 941 } 942 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF) 943 { 944 System.out.print 945 ( 946 "Leaf: (pos=" 947 + ((CMLeaf)nodeCur).getPosition() 948 + "), " 949 + ((CMLeaf)nodeCur).getElement() 950 + "(elemIndex=" 951 + ((CMLeaf)nodeCur).getElement() 952 + ") " 953 ); 954 955 if (nodeCur.isNullable()) 956 System.out.print(" Nullable "); 957 958 System.out.print("firstPos="); 959 System.out.print(nodeCur.firstPos().toString()); 960 System.out.print(" lastPos="); 961 System.out.println(nodeCur.lastPos().toString()); 962 } 963 else 964 { 965 throw new RuntimeException("ImplementationMessages.VAL_NIICM"); 966 } 967 } 968 969 970 /** 971 * -1 is used to represent bad transitions in the transition table 972 * entry for each state. So each entry is initialized to an all -1 973 * array. This method creates a new entry and initializes it. 974 */ 975 private int[] makeDefStateList() 976 { 977 int[] retArray = new int[fElemMapSize]; 978 for (int index = 0; index < fElemMapSize; index++) 979 retArray[index] = -1; 980 return retArray; 981 } 982 983 /** Post tree build initialization. */ 984 private int postTreeBuildInit(CMNode nodeCur, int curIndex) 985 { 986 // Set the maximum states on this node 987 nodeCur.setMaxStates(fLeafCount); 988 989 // Recurse as required 990 if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY || 991 (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL || 992 (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) { 993 // REVISIT: Don't waste these structures. 994 QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI()); 995 fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition()); 996 fLeafListType[curIndex] = nodeCur.type(); 997 curIndex++; 998 } 999 else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE) 1000 || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)) 1001 { 1002 curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex); 1003 curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex); 1004 } 1005 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE 1006 || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE 1007 || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) 1008 { 1009 curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex); 1010 } 1011 else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF) 1012 { 1013 // 1014 // Put this node in the leaf list at the current index if its 1015 // a non-epsilon leaf. 1016 // 1017 final QName node = ((CMLeaf)nodeCur).getElement(); 1018 if (node.localpart != fEpsilonString) { 1019 fLeafList[curIndex] = (CMLeaf)nodeCur; 1020 fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF; 1021 curIndex++; 1022 } 1023 } 1024 else 1025 { 1026 throw new RuntimeException("ImplementationMessages.VAL_NIICM: type="+nodeCur.type()); 1027 } 1028 return curIndex; 1029 } 1030 1031 } // class DFAContentModel