--- /dev/null 2012-12-18 12:25:05.000000000 +0000 +++ new/src/share/classes/jdk/internal/util/xml/impl/Parser.java 2012-12-18 12:25:05.000000000 +0000 @@ -0,0 +1,3372 @@ +/* + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.internal.util.xml.impl; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Map; +import jdk.internal.org.xml.sax.InputSource; +import jdk.internal.org.xml.sax.SAXException; + +/** + * XML non-validating parser engine. 
+ */ +public abstract class Parser { + + public final static String FAULT = ""; + protected final static int BUFFSIZE_READER = 512; + protected final static int BUFFSIZE_PARSER = 128; + /** + * The end of stream character. + */ + public final static char EOS = 0xffff; + private Pair mNoNS; // there is no namespace + private Pair mXml; // the xml namespace + private Map mEnt; // the entities look up table + private Map mPEnt; // the parmeter entities look up table + protected boolean mIsSAlone; // xml decl standalone flag + protected boolean mIsSAloneSet; // standalone is explicitely set + protected boolean mIsNSAware; // if true - namespace aware mode + protected int mPh; // current phase of document processing + protected final static int PH_BEFORE_DOC = -1; // before parsing + protected final static int PH_DOC_START = 0; // document start + protected final static int PH_MISC_DTD = 1; // misc before DTD + protected final static int PH_DTD = 2; // DTD + protected final static int PH_DTD_MISC = 3; // misc after DTD + protected final static int PH_DOCELM = 4; // document's element + protected final static int PH_DOCELM_MISC = 5; // misc after element + protected final static int PH_AFTER_DOC = 6; // after parsing + protected int mEvt; // current event type + protected final static int EV_NULL = 0; // unknown + protected final static int EV_ELM = 1; // empty element + protected final static int EV_ELMS = 2; // start element + protected final static int EV_ELME = 3; // end element + protected final static int EV_TEXT = 4; // textual content + protected final static int EV_WSPC = 5; // white space content + protected final static int EV_PI = 6; // processing instruction + protected final static int EV_CDAT = 7; // character data + protected final static int EV_COMM = 8; // comment + protected final static int EV_DTD = 9; // document type definition + protected final static int EV_ENT = 10; // skipped entity + private char mESt; // built-in entity recognizer state + // 
mESt values: + // 0x100 : the initial state + // > 0x100 : unrecognized name + // < 0x100 : replacement character + protected char[] mBuff; // parser buffer + protected int mBuffIdx; // index of the last char + protected Pair mPref; // stack of prefixes + protected Pair mElm; // stack of elements + // mAttL.chars - element qname + // mAttL.next - next element + // mAttL.list - list of attributes defined on this element + // mAttL.list.chars - attribute qname + // mAttL.list.id - a char representing attribute's type see below + // mAttL.list.next - next attribute defined on the element + // mAttL.list.list - devault value structure or null + // mAttL.list.list.chars - "name='value' " chars array for Input + // + // Attribute type character values: + // 'i' - "ID" + // 'r' - "IDREF" + // 'R' - "IDREFS" + // 'n' - "ENTITY" + // 'N' - "ENTITIES" + // 't' - "NMTOKEN" + // 'T' - "NMTOKENS" + // 'u' - enumeration type + // 'o' - "NOTATION" + // 'c' - "CDATA" + // see also: bkeyword() and atype() + // + protected Pair mAttL; // list of defined attrs by element name + protected Input mDoc; // document entity + protected Input mInp; // stack of entities + private char[] mChars; // reading buffer + private int mChLen; // current capacity + private int mChIdx; // index to the next char + protected Attrs mAttrs; // attributes of the curr. element + private String[] mItems; // attributes array of the curr. 
element + private char mAttrIdx; // attributes counter/index + private String mUnent; // unresolved entity name + private Pair mDltd; // deleted objects for reuse + /** + * Default prefixes + */ + private final static char NONS[]; + private final static char XML[]; + private final static char XMLNS[]; + + static { + NONS = new char[1]; + NONS[0] = (char) 0; + + XML = new char[4]; + XML[0] = (char) 4; + XML[1] = 'x'; + XML[2] = 'm'; + XML[3] = 'l'; + + XMLNS = new char[6]; + XMLNS[0] = (char) 6; + XMLNS[1] = 'x'; + XMLNS[2] = 'm'; + XMLNS[3] = 'l'; + XMLNS[4] = 'n'; + XMLNS[5] = 's'; + } + /** + * ASCII character type array. + * + * This array maps an ASCII (7 bit) character to the character type.
+ * Possible character type values are:
- ' ' for any kind of white + * space character;
- 'a' for any lower case alphabetical character + * value;
- 'A' for any upper case alphabetical character value;
+ * - 'd' for any decimal digit character value;
- 'z' for any + * character less then ' ' except '\t', '\n', '\r';
An ASCII (7 bit) + * character which does not fall in any category listed above is mapped to + * it self. + */ + private static final byte asctyp[]; + /** + * NMTOKEN character type array. + * + * This array maps an ASCII (7 bit) character to the character type.
+ * Possible character type values are:
- 0 for underscore ('_') or any + * lower and upper case alphabetical character value;
- 1 for colon + * (':') character;
- 2 for dash ('-') and dot ('.') or any decimal + * digit character value;
- 3 for any kind of white space character
An ASCII (7 bit) character which does not fall in any category listed + * above is mapped to 0xff. + */ + private static final byte nmttyp[]; + + /** + * Static constructor. + * + * Sets up the ASCII character type array which is used by + * {@link #asctyp asctyp} method and NMTOKEN character type array. + */ + static { + short i = 0; + + asctyp = new byte[0x80]; + while (i < ' ') { + asctyp[i++] = (byte) 'z'; + } + asctyp['\t'] = (byte) ' '; + asctyp['\r'] = (byte) ' '; + asctyp['\n'] = (byte) ' '; + while (i < '0') { + asctyp[i] = (byte) i++; + } + while (i <= '9') { + asctyp[i++] = (byte) 'd'; + } + while (i < 'A') { + asctyp[i] = (byte) i++; + } + while (i <= 'Z') { + asctyp[i++] = (byte) 'A'; + } + while (i < 'a') { + asctyp[i] = (byte) i++; + } + while (i <= 'z') { + asctyp[i++] = (byte) 'a'; + } + while (i < 0x80) { + asctyp[i] = (byte) i++; + } + + nmttyp = new byte[0x80]; + for (i = 0; i < '0'; i++) { + nmttyp[i] = (byte) 0xff; + } + while (i <= '9') { + nmttyp[i++] = (byte) 2; // digits + } + while (i < 'A') { + nmttyp[i++] = (byte) 0xff; + } + // skiped upper case alphabetical character are already 0 + for (i = '['; i < 'a'; i++) { + nmttyp[i] = (byte) 0xff; + } + // skiped lower case alphabetical character are already 0 + for (i = '{'; i < 0x80; i++) { + nmttyp[i] = (byte) 0xff; + } + nmttyp['_'] = 0; + nmttyp[':'] = 1; + nmttyp['.'] = 2; + nmttyp['-'] = 2; + nmttyp[' '] = 3; + nmttyp['\t'] = 3; + nmttyp['\r'] = 3; + nmttyp['\n'] = 3; + } + + /** + * Constructor. 
+ */ + protected Parser() { + mPh = PH_BEFORE_DOC; // before parsing + + // Initialize the parser + mBuff = new char[BUFFSIZE_PARSER]; + mAttrs = new Attrs(); + + // Default namespace + mPref = pair(mPref); + mPref.name = ""; + mPref.value = ""; + mPref.chars = NONS; + mNoNS = mPref; // no namespace + // XML namespace + mPref = pair(mPref); + mPref.name = "xml"; + mPref.value = "http://www.w3.org/XML/1998/namespace"; + mPref.chars = XML; + mXml = mPref; // XML namespace + } + + /** + * Initializes parser's internals. Note, current input has to be set before + * this method is called. + */ + protected void init() { + mUnent = null; + mElm = null; + mPref = mXml; + mAttL = null; + mPEnt = new HashMap(); + mEnt = new HashMap(); + mDoc = mInp; // current input is document entity + mChars = mInp.chars; // use document entity buffer + mPh = PH_DOC_START; // the begining of the document + } + + /** + * Cleans up parser internal resources. + */ + protected void cleanup() { + // Default attributes + while (mAttL != null) { + while (mAttL.list != null) { + if (mAttL.list.list != null) { + del(mAttL.list.list); + } + mAttL.list = del(mAttL.list); + } + mAttL = del(mAttL); + } + // Element stack + while (mElm != null) { + mElm = del(mElm); + } + // Namespace prefixes + while (mPref != mXml) { + mPref = del(mPref); + } + // Inputs + while (mInp != null) { + pop(); + } + // Document reader + if ((mDoc != null) && (mDoc.src != null)) { + try { + mDoc.src.close(); + } catch (IOException ioe) { + } + } + mPEnt = null; + mEnt = null; + mDoc = null; + mPh = PH_AFTER_DOC; // before documnet processing + } + + /** + * Processes a portion of document. This method returns one of EV_* + * constants as an identifier of the portion of document have been read. + * + * @return Identifier of processed document portion. + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + protected int step() + throws Exception { + mEvt = EV_NULL; + int st = 0; + while (mEvt == EV_NULL) { + char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); + switch (st) { + case 0: // all sorts of markup (dispetcher) + if (ch != '<') { + bkch(); + mBuffIdx = -1; // clean parser buffer + st = 1; + break; + } + switch (getch()) { + case '/': // the end of the element content + mEvt = EV_ELME; + if (mElm == null) { + panic(FAULT); + } + // Check element's open/close tags balance + mBuffIdx = -1; // clean parser buffer + bname(mIsNSAware); + char[] chars = mElm.chars; + if (chars.length == (mBuffIdx + 1)) { + for (char i = 1; i <= mBuffIdx; i += 1) { + if (chars[i] != mBuff[i]) { + panic(FAULT); + } + } + } else { + panic(FAULT); + } + // Skip white spaces before '>' + if (wsskip() != '>') { + panic(FAULT); + } + getch(); // read '>' + break; + + case '!': // a comment or a CDATA + ch = getch(); + bkch(); + switch (ch) { + case '-': // must be a comment + mEvt = EV_COMM; + comm(); + break; + + case '[': // must be a CDATA section + mEvt = EV_CDAT; + cdat(); + break; + + default: // must be 'DOCTYPE' + mEvt = EV_DTD; + dtd(); + break; + } + break; + + case '?': // processing instruction + mEvt = EV_PI; + pi(); + break; + + default: // must be the first char of an xml name + bkch(); + // Read an element name and put it on top of the + // element stack + mElm = pair(mElm); // add new element to the stack + mElm.chars = qname(mIsNSAware); + mElm.name = mElm.local(); + mElm.id = (mElm.next != null) ? mElm.next.id : 0; // flags + mElm.num = 0; // namespace counter + // Find the list of defined attributs of the current + // element + Pair elm = find(mAttL, mElm.chars); + mElm.list = (elm != null) ? elm.list : null; + // Read attributes till the end of the element tag + mAttrIdx = 0; + Pair att = pair(null); + att.num = 0; // clear attribute's flags + attr(att); // get all attributes inc. 
defaults + del(att); + mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null; + // Skip white spaces before '>' + switch (wsskip()) { + case '>': + getch(); // read '>' + mEvt = EV_ELMS; + break; + + case '/': + getch(); // read '/' + if (getch() != '>') // read '>' + { + panic(FAULT); + } + mEvt = EV_ELM; + break; + + default: + panic(FAULT); + } + break; + } + break; + + case 1: // read white space + switch (ch) { + case ' ': + case '\t': + case '\n': + bappend(ch); + break; + + case '\r': // EOL processing [#2.11] + if (getch() != '\n') { + bkch(); + } + bappend('\n'); + break; + + case '<': + mEvt = EV_WSPC; + bkch(); + bflash_ws(); + break; + + default: + bkch(); + st = 2; + break; + } + break; + + case 2: // read the text content of the element + switch (ch) { + case '&': + if (mUnent == null) { + // There was no unresolved entity on previous step. + if ((mUnent = ent('x')) != null) { + mEvt = EV_TEXT; + bkch(); // move back to ';' after entity name + setch('&'); // parser must be back on next step + bflash(); + } + } else { + // There was unresolved entity on previous step. + mEvt = EV_ENT; + skippedEnt(mUnent); + mUnent = null; + } + break; + + case '<': + mEvt = EV_TEXT; + bkch(); + bflash(); + break; + + case '\r': // EOL processing [#2.11] + if (getch() != '\n') { + bkch(); + } + bappend('\n'); + break; + + case EOS: + panic(FAULT); + + default: + bappend(ch); + break; + } + break; + + default: + panic(FAULT); + } + } + + return mEvt; + } + + /** + * Parses the document type declaration. + * + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private void dtd() + throws Exception { + char ch; + String str = null; + String name = null; + Pair psid = null; + // read 'DOCTYPE' + if ("DOCTYPE".equals(name(false)) != true) { + panic(FAULT); + } + mPh = PH_DTD; // DTD + for (short st = 0; st >= 0;) { + ch = getch(); + switch (st) { + case 0: // read the document type name + if (chtyp(ch) != ' ') { + bkch(); + name = name(mIsNSAware); + wsskip(); + st = 1; // read 'PUPLIC' or 'SYSTEM' + } + break; + + case 1: // read 'PUPLIC' or 'SYSTEM' + switch (chtyp(ch)) { + case 'A': + bkch(); + psid = pubsys(' '); + st = 2; // skip spaces before internal subset + docType(name, psid.name, psid.value); + break; + + case '[': + bkch(); + st = 2; // skip spaces before internal subset + docType(name, null, null); + break; + + case '>': + bkch(); + st = 3; // skip spaces after internal subset + docType(name, null, null); + break; + + default: + panic(FAULT); + } + break; + + case 2: // skip spaces before internal subset + switch (chtyp(ch)) { + case '[': + // Process internal subset + dtdsub(); + st = 3; // skip spaces after internal subset + break; + + case '>': + // There is no internal subset + bkch(); + st = 3; // skip spaces after internal subset + break; + + case ' ': + // skip white spaces + break; + + default: + panic(FAULT); + } + break; + + case 3: // skip spaces after internal subset + switch (chtyp(ch)) { + case '>': + if (psid != null) { + // Report the DTD external subset + InputSource is = resolveEnt(name, psid.name, psid.value); + if (is != null) { + if (mIsSAlone == false) { + // Set the end of DTD external subset char + bkch(); + setch(']'); + // Set the DTD external subset InputSource + push(new Input(BUFFSIZE_READER)); + setinp(is); + mInp.pubid = psid.name; + mInp.sysid = psid.value; + // Parse the DTD external subset + dtdsub(); + } else { + // Unresolved DTD external subset + skippedEnt("[dtd]"); + // Release reader and stream + if (is.getCharacterStream() != null) { + 
try { + is.getCharacterStream().close(); + } catch (IOException ioe) { + } + } + if (is.getByteStream() != null) { + try { + is.getByteStream().close(); + } catch (IOException ioe) { + } + } + } + } else { + // Unresolved DTD external subset + skippedEnt("[dtd]"); + } + del(psid); + } + st = -1; // end of DTD + break; + + case ' ': + // skip white spaces + break; + + default: + panic(FAULT); + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Parses the document type declaration subset. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void dtdsub() + throws Exception { + char ch; + for (short st = 0; st >= 0;) { + ch = getch(); + switch (st) { + case 0: // skip white spaces before a declaration + switch (chtyp(ch)) { + case '<': + ch = getch(); + switch (ch) { + case '?': + pi(); + break; + + case '!': + ch = getch(); + bkch(); + if (ch == '-') { + comm(); + break; + } + // A markup or an entity declaration + bntok(); + switch (bkeyword()) { + case 'n': + dtdent(); + break; + + case 'a': + dtdattl(); // parse attributes declaration + break; + + case 'e': + dtdelm(); // parse element declaration + break; + + case 'o': + dtdnot(); // parse notation declaration + break; + + default: + panic(FAULT); // unsupported markup declaration + break; + } + st = 1; // read the end of declaration + break; + + default: + panic(FAULT); + break; + } + break; + + case '%': + // A parameter entity reference + pent(' '); + break; + + case ']': + // End of DTD subset + st = -1; + break; + + case ' ': + // Skip white spaces + break; + + case 'Z': + // End of stream + if (getch() != ']') { + panic(FAULT); + } + st = -1; + break; + + default: + panic(FAULT); + } + break; + + case 1: // read the end of declaration + switch (ch) { + case '>': // there is no notation + st = 0; // skip white spaces before a declaration + break; + + case ' ': + case '\n': + case '\r': + case '\t': + // Skip white spaces + break; 
+ + default: + panic(FAULT); + break; + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Parses an entity declaration. This method fills the general ( + * mEnt) and parameter + * ( + * mPEnt) entity look up table. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void dtdent() + throws Exception { + String str = null; + char[] val = null; + Input inp = null; + Pair ids = null; + char ch; + for (short st = 0; st >= 0;) { + ch = getch(); + switch (st) { + case 0: // skip white spaces before entity name + switch (chtyp(ch)) { + case ' ': + // Skip white spaces + break; + + case '%': + // Parameter entity or parameter entity declaration. + ch = getch(); + bkch(); + if (chtyp(ch) == ' ') { + // Parameter entity declaration. + wsskip(); + str = name(false); + switch (chtyp(wsskip())) { + case 'A': + // Read the external identifier + ids = pubsys(' '); + if (wsskip() == '>') { + // External parsed entity + if (mPEnt.containsKey(str) == false) { // [#4.2] + inp = new Input(); + inp.pubid = ids.name; + inp.sysid = ids.value; + mPEnt.put(str, inp); + } + } else { + panic(FAULT); + } + del(ids); + st = -1; // the end of declaration + break; + + case '\"': + case '\'': + // Read the parameter entity value + bqstr('d'); + // Create the parameter entity value + val = new char[mBuffIdx + 1]; + System.arraycopy(mBuff, 1, val, 1, val.length - 1); + // Add surrounding spaces [#4.4.8] + val[0] = ' '; + // Add the entity to the entity look up table + if (mPEnt.containsKey(str) == false) { // [#4.2] + inp = new Input(val); + inp.pubid = mInp.pubid; + inp.sysid = mInp.sysid; + inp.xmlenc = mInp.xmlenc; + inp.xmlver = mInp.xmlver; + mPEnt.put(str, inp); + } + st = -1; // the end of declaration + break; + + default: + panic(FAULT); + break; + } + } else { + // Parameter entity reference. 
+ pent(' '); + } + break; + + default: + bkch(); + str = name(false); + st = 1; // read entity declaration value + break; + } + break; + + case 1: // read entity declaration value + switch (chtyp(ch)) { + case '\"': // internal entity + case '\'': + bkch(); + bqstr('d'); // read a string into the buffer + if (mEnt.get(str) == null) { + // Create general entity value + val = new char[mBuffIdx]; + System.arraycopy(mBuff, 1, val, 0, val.length); + // Add the entity to the entity look up table + if (mEnt.containsKey(str) == false) { // [#4.2] + inp = new Input(val); + inp.pubid = mInp.pubid; + inp.sysid = mInp.sysid; + inp.xmlenc = mInp.xmlenc; + inp.xmlver = mInp.xmlver; + mEnt.put(str, inp); + } + } + st = -1; // the end of declaration + break; + + case 'A': // external entity + bkch(); + ids = pubsys(' '); + switch (wsskip()) { + case '>': // external parsed entity + if (mEnt.containsKey(str) == false) { // [#4.2] + inp = new Input(); + inp.pubid = ids.name; + inp.sysid = ids.value; + mEnt.put(str, inp); + } + break; + + case 'N': // external general unparsed entity + if ("NDATA".equals(name(false)) == true) { + wsskip(); + unparsedEntDecl(str, ids.name, ids.value, name(false)); + break; + } + default: + panic(FAULT); + break; + } + del(ids); + st = -1; // the end of declaration + break; + + case ' ': + // Skip white spaces + break; + + default: + panic(FAULT); + break; + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Parses an element declaration. + * + * This method parses the declaration up to the closing angle bracket. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void dtdelm() + throws Exception { + // This is stub implementation which skips an element + // declaration. 
+ wsskip(); + name(mIsNSAware); + + char ch; + while (true) { + ch = getch(); + switch (ch) { + case '>': + bkch(); + return; + + case EOS: + panic(FAULT); + + default: + break; + } + } + } + + /** + * Parses an attribute list declaration. + * + * This method parses the declaration up to the closing angle bracket. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void dtdattl() + throws Exception { + char elmqn[] = null; + Pair elm = null; + char ch; + for (short st = 0; st >= 0;) { + ch = getch(); + switch (st) { + case 0: // read the element name + switch (chtyp(ch)) { + case 'a': + case 'A': + case '_': + case 'X': + case ':': + bkch(); + // Get the element from the list or add a new one. + elmqn = qname(mIsNSAware); + elm = find(mAttL, elmqn); + if (elm == null) { + elm = pair(mAttL); + elm.chars = elmqn; + mAttL = elm; + } + st = 1; // read an attribute declaration + break; + + case ' ': + break; + + case '%': + pent(' '); + break; + + default: + panic(FAULT); + break; + } + break; + + case 1: // read an attribute declaration + switch (chtyp(ch)) { + case 'a': + case 'A': + case '_': + case 'X': + case ':': + bkch(); + dtdatt(elm); + if (wsskip() == '>') { + return; + } + break; + + case ' ': + break; + + case '%': + pent(' '); + break; + + default: + panic(FAULT); + break; + } + break; + + default: + panic(FAULT); + break; + } + } + } + + /** + * Parses an attribute declaration. + * + * The attribute uses the following fields of Pair object: chars - characters + * of qualified name id - the type identifier of the attribute list - a pair + * which holds the default value (chars field) + * + * @param elm An object which represents all defined attributes on an + * element. + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private void dtdatt(Pair elm) + throws Exception { + char attqn[] = null; + Pair att = null; + char ch; + for (short st = 0; st >= 0;) { + ch = getch(); + switch (st) { + case 0: // the attribute name + switch (chtyp(ch)) { + case 'a': + case 'A': + case '_': + case 'X': + case ':': + bkch(); + // Get the attribut from the list or add a new one. + attqn = qname(mIsNSAware); + att = find(elm.list, attqn); + if (att == null) { + // New attribute declaration + att = pair(elm.list); + att.chars = attqn; + elm.list = att; + } else { + // Do not override the attribute declaration [#3.3] + att = pair(null); + att.chars = attqn; + att.id = 'c'; + } + wsskip(); + st = 1; + break; + + case '%': + pent(' '); + break; + + case ' ': + break; + + default: + panic(FAULT); + break; + } + break; + + case 1: // the attribute type + switch (chtyp(ch)) { + case '(': + att.id = 'u'; // enumeration type + st = 2; // read the first element of the list + break; + + case '%': + pent(' '); + break; + + case ' ': + break; + + default: + bkch(); + bntok(); // read type id + att.id = bkeyword(); + switch (att.id) { + case 'o': // NOTATION + if (wsskip() != '(') { + panic(FAULT); + } + ch = getch(); + st = 2; // read the first element of the list + break; + + case 'i': // ID + case 'r': // IDREF + case 'R': // IDREFS + case 'n': // ENTITY + case 'N': // ENTITIES + case 't': // NMTOKEN + case 'T': // NMTOKENS + case 'c': // CDATA + wsskip(); + st = 4; // read default declaration + break; + + default: + panic(FAULT); + break; + } + break; + } + break; + + case 2: // read the first element of the list + switch (chtyp(ch)) { + case 'a': + case 'A': + case 'd': + case '.': + case ':': + case '-': + case '_': + case 'X': + bkch(); + switch (att.id) { + case 'u': // enumeration type + bntok(); + break; + + case 'o': // NOTATION + mBuffIdx = -1; + bname(false); + break; + + default: + panic(FAULT); + break; + } + wsskip(); + st = 3; // read next element of the list + 
break; + + case '%': + pent(' '); + break; + + case ' ': + break; + + default: + panic(FAULT); + break; + } + break; + + case 3: // read next element of the list + switch (ch) { + case ')': + wsskip(); + st = 4; // read default declaration + break; + + case '|': + wsskip(); + switch (att.id) { + case 'u': // enumeration type + bntok(); + break; + + case 'o': // NOTATION + mBuffIdx = -1; + bname(false); + break; + + default: + panic(FAULT); + break; + } + wsskip(); + break; + + case '%': + pent(' '); + break; + + default: + panic(FAULT); + break; + } + break; + + case 4: // read default declaration + switch (ch) { + case '#': + bntok(); + switch (bkeyword()) { + case 'F': // FIXED + switch (wsskip()) { + case '\"': + case '\'': + st = 5; // read the default value + break; + + case EOS: + panic(FAULT); + + default: + st = -1; + break; + } + break; + + case 'Q': // REQUIRED + case 'I': // IMPLIED + st = -1; + break; + + default: + panic(FAULT); + break; + } + break; + + case '\"': + case '\'': + bkch(); + st = 5; // read the default value + break; + + case ' ': + case '\n': + case '\r': + case '\t': + break; + + case '%': + pent(' '); + break; + + default: + bkch(); + st = -1; + break; + } + break; + + case 5: // read the default value + switch (ch) { + case '\"': + case '\'': + bkch(); + bqstr('d'); // the value in the mBuff now + att.list = pair(null); + // Create a string like "attqname='value' " + att.list.chars = new char[att.chars.length + mBuffIdx + 3]; + System.arraycopy( + att.chars, 1, att.list.chars, 0, att.chars.length - 1); + att.list.chars[att.chars.length - 1] = '='; + att.list.chars[att.chars.length] = ch; + System.arraycopy( + mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx); + att.list.chars[att.chars.length + mBuffIdx + 1] = ch; + att.list.chars[att.chars.length + mBuffIdx + 2] = ' '; + st = -1; + break; + + default: + panic(FAULT); + break; + } + break; + + default: + panic(FAULT); + break; + } + } + } + + /** + * Parses a notation 
declaration. + * + * This method parses the declaration up to the closing angle bracket. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void dtdnot() + throws Exception { + wsskip(); + String name = name(false); + wsskip(); + Pair ids = pubsys('N'); + notDecl(name, ids.name, ids.value); + del(ids); + } + + /** + * Parses an attribute. + * + * This recursive method is responsible for prefix addition + * ( + * mPref) on the way down. The element's start tag end triggers + * the return process. The method then on it's way back resolves prefixes + * and accumulates attributes. + * + *

att.num carries attribute flags where: 0x1 - attribute is + * declared in DTD (attribute decalration had been read); 0x2 - attribute's + * default value is used.

+ * + * @param att An object which reprecents current attribute. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void attr(Pair att) + throws Exception { + switch (wsskip()) { + case '/': + case '>': + if ((att.num & 0x2) == 0) { // all attributes have been read + att.num |= 0x2; // set default attribute flag + Input inp = mInp; + // Go through all attributes defined on current element. + for (Pair def = mElm.list; def != null; def = def.next) { + if (def.list == null) // no default value + { + continue; + } + // Go through all attributes defined on current + // element and add defaults. + Pair act = find(att.next, def.chars); + if (act == null) { + push(new Input(def.list.chars)); + } + } + if (mInp != inp) { // defaults have been added + attr(att); + return; + } + } + // Ensure the attribute string array capacity + mAttrs.setLength(mAttrIdx); + mItems = mAttrs.mItems; + return; + + case EOS: + panic(FAULT); + + default: + // Read the attribute name and value + att.chars = qname(mIsNSAware); + att.name = att.local(); + String type = atype(att); // sets attribute's type on att.id + wsskip(); + if (getch() != '=') { + panic(FAULT); + } + bqstr((char) att.id); // read the value with normalization. + String val = new String(mBuff, 1, mBuffIdx); + Pair next = pair(att); + next.num = (att.num & ~0x1); // inherit attribute flags + // Put a namespace declaration on top of the prefix stack + if ((mIsNSAware == false) || (isdecl(att, val) == false)) { + // An ordinary attribute + mAttrIdx++; + attr(next); // recursive call to parse the next attribute + mAttrIdx--; + // Add the attribute to the attributes string array + char idx = (char) (mAttrIdx << 3); + mItems[idx + 1] = att.qname(); // attr qname + mItems[idx + 2] = (mIsNSAware) ? 
att.name : ""; // attr local name + mItems[idx + 3] = val; // attr value + mItems[idx + 4] = type; // attr type + switch (att.num & 0x3) { + case 0x0: + mItems[idx + 5] = null; + break; + + case 0x1: // declared attribute + mItems[idx + 5] = "d"; + break; + + default: // 0x2, 0x3 - default attribute always declared + mItems[idx + 5] = "D"; + break; + } + // Resolve the prefix if any and report the attribute + // NOTE: The attribute does not accept the default namespace. + mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; + } else { + // A namespace declaration. mPref.name contains prefix and + // mPref.value contains namespace URI set by isdecl method. + // Report a start of the new mapping + newPrefix(); + // Recursive call to parse the next attribute + attr(next); + // NOTE: The namespace declaration is not reported. + } + del(next); + break; + } + } + + /** + * Retrieves attribute type. + * + * This method sets the type of normalization in the attribute + * id field and returns the name of attribute type. + * + * @param att An object which represents current attribute. + * @return The name of the attribute type. + * @exception Exception is parser specific exception form panic method. + */ + private String atype(Pair att) + throws Exception { + Pair attr; + + // CDATA-type normalization by default [#3.3.3] + att.id = 'c'; + if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { + return "CDATA"; + } + + att.num |= 0x1; // attribute is declared + + // Non-CDATA normalization except when the attribute type is CDATA. 
+ att.id = 'i'; + switch (attr.id) { + case 'i': + return "ID"; + + case 'r': + return "IDREF"; + + case 'R': + return "IDREFS"; + + case 'n': + return "ENTITY"; + + case 'N': + return "ENTITIES"; + + case 't': + return "NMTOKEN"; + + case 'T': + return "NMTOKENS"; + + case 'u': + return "NMTOKEN"; + + case 'o': + return "NOTATION"; + + case 'c': + att.id = 'c'; + return "CDATA"; + + default: + panic(FAULT); + } + return null; + } + + /** + * Parses a comment. + * + * The '<!' part is read in dispatcher so the method starts + * with first '-' after '<!'. + * + * @exception Exception is parser specific exception form panic method. + */ + private void comm() + throws Exception { + if (mPh == PH_DOC_START) { + mPh = PH_MISC_DTD; // misc before DTD + } // '= 0;) { + ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); + if (ch == EOS) { + panic(FAULT); + } + switch (st) { + case 0: // first '-' of the comment open + if (ch == '-') { + st = 1; + } else { + panic(FAULT); + } + break; + + case 1: // secind '-' of the comment open + if (ch == '-') { + st = 2; + } else { + panic(FAULT); + } + break; + + case 2: // skip the comment body + switch (ch) { + case '-': + st = 3; + break; + + default: + bappend(ch); + break; + } + break; + + case 3: // second '-' of the comment close + switch (ch) { + case '-': + st = 4; + break; + + default: + bappend('-'); + bappend(ch); + st = 2; + break; + } + break; + + case 4: // '>' of the comment close + if (ch == '>') { + comm(mBuff, mBuffIdx + 1); + st = -1; + break; + } + // else - panic [#2.5 compatibility note] + + default: + panic(FAULT); + } + } + } + + /** + * Parses a processing instruction. + * + * The '<?' is read in dispatcher so the method starts with + * first character of PI target name after '<?'. + * + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private void pi() + throws Exception { + // '= 0;) { + ch = getch(); + if (ch == EOS) { + panic(FAULT); + } + switch (st) { + case 0: // read the PI target name + switch (chtyp(ch)) { + case 'a': + case 'A': + case '_': + case ':': + case 'X': + bkch(); + str = name(false); + // PI target name may not be empty string [#2.6] + // PI target name 'XML' is reserved [#2.6] + if ((str.length() == 0) + || (mXml.name.equals(str.toLowerCase()) == true)) { + panic(FAULT); + } + // This is processing instruction + if (mPh == PH_DOC_START) // the begining of the document + { + mPh = PH_MISC_DTD; // misc before DTD + } + wsskip(); // skip spaces after the PI target name + st = 1; // accumulate the PI body + mBuffIdx = -1; + break; + + default: + panic(FAULT); + } + break; + + case 1: // accumulate the PI body + switch (ch) { + case '?': + st = 2; // end of the PI body + break; + + default: + bappend(ch); + break; + } + break; + + case 2: // end of the PI body + switch (ch) { + case '>': + // PI has been read. + pi(str, new String(mBuff, 0, mBuffIdx + 1)); + st = -1; + break; + + case '?': + bappend('?'); + break; + + default: + bappend('?'); + bappend(ch); + st = 1; // accumulate the PI body + break; + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Parses a character data. + * + * The '<!' part is read in dispatcher so the method starts + * with first '[' after '<!'. + * + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private void cdat() + throws Exception { + // '= 0;) { + ch = getch(); + switch (st) { + case 0: // the first '[' of the CDATA open + if (ch == '[') { + st = 1; + } else { + panic(FAULT); + } + break; + + case 1: // read "CDATA" + if (chtyp(ch) == 'A') { + bappend(ch); + } else { + if ("CDATA".equals( + new String(mBuff, 0, mBuffIdx + 1)) != true) { + panic(FAULT); + } + bkch(); + st = 2; + } + break; + + case 2: // the second '[' of the CDATA open + if (ch != '[') { + panic(FAULT); + } + mBuffIdx = -1; + st = 3; + break; + + case 3: // read data before the first ']' + if (ch != ']') { + bappend(ch); + } else { + st = 4; + } + break; + + case 4: // read the second ']' or continue to read the data + if (ch != ']') { + bappend(']'); + bappend(ch); + st = 3; + } else { + st = 5; + } + break; + + case 5: // read '>' or continue to read the data + switch (ch) { + case ']': + bappend(']'); + break; + + case '>': + bflash(); + st = -1; + break; + + default: + bappend(']'); + bappend(']'); + bappend(ch); + st = 3; + break; + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Reads a xml name. + * + * The xml name must conform "Namespaces in XML" specification. Therefore + * the ':' character is not allowed in the name. This method should be used + * for PI and entity names which may not have a namespace according to the + * specification mentioned above. + * + * @param ns The true value turns namespace conformance on. + * @return The name has been read. + * @exception Exception When incorrect character appear in the name. + * @exception IOException + */ + protected String name(boolean ns) + throws Exception { + mBuffIdx = -1; + bname(ns); + return new String(mBuff, 1, mBuffIdx); + } + + /** + * Reads a qualified xml name. + * + * The characters of a qualified name is an array of characters. The first + * (chars[0]) character is the index of the colon character which separates + * the prefix from the local name. 
If the index is zero, the name does not + * contain separator or the parser works in the namespace unaware mode. The + * length of qualified name is the length of the array minus one. + * + * @param ns The true value turns namespace conformance on. + * @return The characters of a qualified name. + * @exception Exception When incorrect character appear in the name. + * @exception IOException + */ + protected char[] qname(boolean ns) + throws Exception { + mBuffIdx = -1; + bname(ns); + char chars[] = new char[mBuffIdx + 1]; + System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); + return chars; + } + + /** + * Reads the public or/and system identifiers. + * + * @param inp The input object. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void pubsys(Input inp) + throws Exception { + Pair pair = pubsys(' '); + inp.pubid = pair.name; + inp.sysid = pair.value; + del(pair); + } + + /** + * Reads the public or/and system identifiers. + * + * @param flag The 'N' allows public id be without system id. + * @return The public or/and system identifiers pair. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private Pair pubsys(char flag) + throws Exception { + Pair ids = pair(null); + String str = name(false); + if ("PUBLIC".equals(str) == true) { + bqstr('i'); // non-CDATA normalization [#4.2.2] + ids.name = new String(mBuff, 1, mBuffIdx); + switch (wsskip()) { + case '\"': + case '\'': + bqstr(' '); + ids.value = new String(mBuff, 1, mBuffIdx); + break; + + case EOS: + panic(FAULT); + + default: + if (flag != 'N') // [#4.7] + { + panic(FAULT); + } + ids.value = null; + break; + } + return ids; + } else if ("SYSTEM".equals(str) == true) { + ids.name = null; + bqstr(' '); + ids.value = new String(mBuff, 1, mBuffIdx); + return ids; + } + panic(FAULT); + return null; + } + + /** + * Reads an attribute value. + * + * The grammar which this method can read is:
+ * eqstr := S "=" qstr
+ * qstr := S ("'" string "'") | + * ('"' string '"')
This method resolves entities + * inside a string unless the parser parses DTD. + * + * @param flag The '=' character forces the method to accept the '=' + * character before quoted string and read the following string as not an + * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; + * '-' - not an attribute value; 'd' - in DTD context. + * @return The content of the quoted strign as a string. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + protected String eqstr(char flag) + throws Exception { + if (flag == '=') { + wsskip(); + if (getch() != '=') { + panic(FAULT); + } + } + bqstr((flag == '=') ? '-' : flag); + return new String(mBuff, 1, mBuffIdx); + } + + /** + * Resoves an entity. + * + * This method resolves built-in and character entity references. It is also + * reports external entities to the application. + * + * @param flag The 'x' character forces the method to report a skipped + * entity; 'i' character - indicates non-CDATA normalization. + * @return Name of unresolved entity or null if entity had been + * resolved successfully. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private String ent(char flag) + throws Exception { + char ch; + int idx = mBuffIdx + 1; + Input inp = null; + String str = null; + mESt = 0x100; // reset the built-in entity recognizer + bappend('&'); + for (short st = 0; st >= 0;) { + ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); + switch (st) { + case 0: // the first character of the entity name + case 1: // read built-in entity name + switch (chtyp(ch)) { + case 'd': + case '.': + case '-': + if (st != 1) { + panic(FAULT); + } + case 'a': + case 'A': + case '_': + case 'X': + bappend(ch); + eappend(ch); + st = 1; + break; + + case ':': + if (mIsNSAware != false) { + panic(FAULT); + } + bappend(ch); + eappend(ch); + st = 1; + break; + + case ';': + if (mESt < 0x100) { + // The entity is a built-in entity + mBuffIdx = idx - 1; + bappend(mESt); + st = -1; + break; + } else if (mPh == PH_DTD) { + // In DTD entity declaration has to resolve character + // entities and include "as is" others. [#4.4.7] + bappend(';'); + st = -1; + break; + } + // Convert an entity name to a string + str = new String(mBuff, idx + 1, mBuffIdx - idx); + inp = (Input) mEnt.get(str); + // Restore the buffer offset + mBuffIdx = idx - 1; + if (inp != null) { + if (inp.chars == null) { + // External entity + InputSource is = resolveEnt(str, inp.pubid, inp.sysid); + if (is != null) { + push(new Input(BUFFSIZE_READER)); + setinp(is); + mInp.pubid = inp.pubid; + mInp.sysid = inp.sysid; + str = null; // the entity is resolved + } else { + // Unresolved external entity + if (flag != 'x') { + panic(FAULT); // unknown entity within marckup + } // str is name of unresolved entity + } + } else { + // Internal entity + push(inp); + str = null; // the entity is resolved + } + } else { + // Unknown or general unparsed entity + if (flag != 'x') { + panic(FAULT); // unknown entity within marckup + } // str is name of unresolved entity + } + st = -1; + break; + + case '#': + if (st != 0) { + panic(FAULT); + } + st = 2; + break; + + default: + panic(FAULT); + } + break; + + case 2: // read character entity + switch (chtyp(ch)) { + case 'd': + bappend(ch); + break; + + case ';': + // Convert the character entity to a character + try { + int i = Integer.parseInt( + new String(mBuff, idx + 1, mBuffIdx - idx), 
10); + if (i >= 0xffff) { + panic(FAULT); + } + ch = (char) i; + } catch (NumberFormatException nfe) { + panic(FAULT); + } + // Restore the buffer offset + mBuffIdx = idx - 1; + if (ch == ' ' || mInp.next != null) { + bappend(ch, flag); + } else { + bappend(ch); + } + st = -1; + break; + + case 'a': + // If the entity buffer is empty and ch == 'x' + if ((mBuffIdx == idx) && (ch == 'x')) { + st = 3; + break; + } + default: + panic(FAULT); + } + break; + + case 3: // read hex character entity + switch (chtyp(ch)) { + case 'A': + case 'a': + case 'd': + bappend(ch); + break; + + case ';': + // Convert the character entity to a character + try { + int i = Integer.parseInt( + new String(mBuff, idx + 1, mBuffIdx - idx), 16); + if (i >= 0xffff) { + panic(FAULT); + } + ch = (char) i; + } catch (NumberFormatException nfe) { + panic(FAULT); + } + // Restore the buffer offset + mBuffIdx = idx - 1; + if (ch == ' ' || mInp.next != null) { + bappend(ch, flag); + } else { + bappend(ch); + } + st = -1; + break; + + default: + panic(FAULT); + } + break; + + default: + panic(FAULT); + } + } + + return str; + } + + /** + * Resoves a parameter entity. + * + * This method resolves a parameter entity references. It is also reports + * external entities to the application. + * + * @param flag The '-' instruct the method to do not set up surrounding + * spaces [#4.4.8]. + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private void pent(char flag) + throws Exception { + char ch; + int idx = mBuffIdx + 1; + Input inp = null; + String str = null; + bappend('%'); + if (mPh != PH_DTD) // the DTD internal subset + { + return; // Not Recognized [#4.4.1] + } // Read entity name + bname(false); + str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); + if (getch() != ';') { + panic(FAULT); + } + inp = (Input) mPEnt.get(str); + // Restore the buffer offset + mBuffIdx = idx - 1; + if (inp != null) { + if (inp.chars == null) { + // External parameter entity + InputSource is = resolveEnt(str, inp.pubid, inp.sysid); + if (is != null) { + if (flag != '-') { + bappend(' '); // tail space + } + push(new Input(BUFFSIZE_READER)); + // BUG: there is no leading space! [#4.4.8] + setinp(is); + mInp.pubid = inp.pubid; + mInp.sysid = inp.sysid; + } else { + // Unresolved external parameter entity + skippedEnt("%" + str); + } + } else { + // Internal parameter entity + if (flag == '-') { + // No surrounding spaces + inp.chIdx = 1; + } else { + // Insert surrounding spaces + bappend(' '); // tail space + inp.chIdx = 0; + } + push(inp); + } + } else { + // Unknown parameter entity + skippedEnt("%" + str); + } + } + + /** + * Recognizes and handles a namespace declaration. + * + * This method identifies a type of namespace declaration if any and puts + * new mapping on top of prefix stack. + * + * @param name The attribute qualified name (name.value is a + * String object which represents the attribute prefix). + * @param value The attribute value. + * @return true if a namespace declaration is recognized. 
+ */ + private boolean isdecl(Pair name, String value) { + if (name.chars[0] == 0) { + if ("xmlns".equals(name.name) == true) { + // New default namespace declaration + mPref = pair(mPref); + mPref.list = mElm; // prefix owner element + mPref.value = value; + mPref.name = ""; + mPref.chars = NONS; + mElm.num++; // namespace counter + return true; + } + } else { + if (name.eqpref(XMLNS) == true) { + // New prefix declaration + int len = name.name.length(); + mPref = pair(mPref); + mPref.list = mElm; // prefix owner element + mPref.value = value; + mPref.name = name.name; + mPref.chars = new char[len + 1]; + mPref.chars[0] = (char) (len + 1); + name.name.getChars(0, len, mPref.chars, 1); + mElm.num++; // namespace counter + return true; + } + } + return false; + } + + /** + * Resolves a prefix. + * + * @return The namespace assigned to the prefix. + * @exception Exception When mapping for specified prefix is not found. + */ + private String rslv(char[] qname) + throws Exception { + for (Pair pref = mPref; pref != null; pref = pref.next) { + if (pref.eqpref(qname) == true) { + return pref.value; + } + } + if (qname[0] == 1) { // QNames like ':local' + for (Pair pref = mPref; pref != null; pref = pref.next) { + if (pref.chars[0] == 0) { + return pref.value; + } + } + } + panic(FAULT); + return null; + } + + /** + * Skips xml white space characters. + * + * This method skips white space characters (' ', '\t', '\n', '\r') and + * looks ahead not white space character. + * + * @return The first not white space look ahead character. + * @exception IOException + */ + protected char wsskip() + throws IOException { + char ch; + while (true) { + // Read next character + ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); + if (ch < 0x80) { + if (nmttyp[ch] != 3) // [ \t\n\r] + { + break; + } + } else { + break; + } + } + mChIdx--; // bkch(); + return ch; + } + + /** + * Reports document type. + * + * @param name The name of the entity. 
     * @param pubid The public identifier of the entity or <code>null</code>.
     * @param sysid The system identifier of the entity or <code>null</code>.
     */
    protected abstract void docType(String name, String pubid, String sysid)
        throws SAXException;

    /**
     * Reports a comment.
     *
     * @param text The comment text starting from first character.
     * @param length The number of characters in comment.
     */
    protected abstract void comm(char[] text, int length);

    /**
     * Reports a processing instruction.
     *
     * @param target The processing instruction target name.
     * @param body The processing instruction body text.
     */
    protected abstract void pi(String target, String body)
        throws Exception;

    /**
     * Reports new namespace prefix. The Namespace prefix
     * (<code>mPref.name</code>) being declared and the Namespace URI
     * (<code>mPref.value</code>) the prefix is mapped to. An empty string is
     * used for the default element namespace, which has no prefix.
     */
    protected abstract void newPrefix()
        throws Exception;

    /**
     * Reports skipped entity name.
     *
     * @param name The entity name.
     */
    protected abstract void skippedEnt(String name)
        throws Exception;

    /**
     * Returns an <code>InputSource</code> for specified entity or
     * <code>null</code>.
     *
     * @param name The name of the entity.
     * @param pubid The public identifier of the entity.
     * @param sysid The system identifier of the entity.
     */
    protected abstract InputSource resolveEnt(
        String name, String pubid, String sysid)
        throws Exception;

    /**
     * Reports notation declaration.
     *
     * @param name The notation's name.
     * @param pubid The notation's public identifier, or null if none was given.
     * @param sysid The notation's system identifier, or null if none was given.
     */
    protected abstract void notDecl(String name, String pubid, String sysid)
        throws Exception;

    /**
     * Reports unparsed entity name.
     *
     * @param name The unparsed entity's name.
     * @param pubid The entity's public identifier, or null if none was given.
+ * @param sysid The entity's system identifier. + * @param notation The name of the associated notation. + */ + protected abstract void unparsedEntDecl( + String name, String pubid, String sysid, String notation) + throws Exception; + + /** + * Notifies the handler about fatal parsing error. + * + * @param msg The problem description message. + */ + protected abstract void panic(String msg) + throws Exception; + + /** + * Reads a qualified xml name. + * + * This is low level routine which leaves a qName in the buffer. The + * characters of a qualified name is an array of characters. The first + * (chars[0]) character is the index of the colon character which separates + * the prefix from the local name. If the index is zero, the name does not + * contain separator or the parser works in the namespace unaware mode. The + * length of qualified name is the length of the array minus one. + * + * @param ns The true value turns namespace conformance on. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void bname(boolean ns) + throws Exception { + char ch; + char type; + mBuffIdx++; // allocate a char for colon offset + int bqname = mBuffIdx; + int bcolon = bqname; + int bchidx = bqname + 1; + int bstart = bchidx; + int cstart = mChIdx; + short st = (short) ((ns == true) ? 0 : 2); + while (true) { + // Read next character + if (mChIdx >= mChLen) { + bcopy(cstart, bstart); + getch(); + mChIdx--; // bkch(); + cstart = mChIdx; + bstart = bchidx; + } + ch = mChars[mChIdx++]; + type = (char) 0; // [X] + if (ch < 0x80) { + type = (char) nmttyp[ch]; + } else if (ch == EOS) { + panic(FAULT); + } + // Parse QName + switch (st) { + case 0: // read the first char of the prefix + case 2: // read the first char of the suffix + switch (type) { + case 0: // [aA_X] + bchidx++; // append char to the buffer + st++; // (st == 0)? 1: 3; + break; + + case 1: // [:] + mChIdx--; // bkch(); + st++; // (st == 0)? 
1: 3; + break; + + default: + panic(FAULT); + } + break; + + case 1: // read the prefix + case 3: // read the suffix + switch (type) { + case 0: // [aA_X] + case 2: // [.-d] + bchidx++; // append char to the buffer + break; + + case 1: // [:] + bchidx++; // append char to the buffer + if (ns == true) { + if (bcolon != bqname) { + panic(FAULT); // it must be only one colon + } + bcolon = bchidx - 1; + if (st == 1) { + st = 2; + } + } + break; + + default: + mChIdx--; // bkch(); + bcopy(cstart, bstart); + mBuff[bqname] = (char) (bcolon - bqname); + return; + } + break; + + default: + panic(FAULT); + } + } + } + + /** + * Reads a nmtoken. + * + * This is low level routine which leaves a nmtoken in the buffer. + * + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void bntok() + throws Exception { + char ch; + mBuffIdx = -1; + bappend((char) 0); // default offset to the colon char + while (true) { + ch = getch(); + switch (chtyp(ch)) { + case 'a': + case 'A': + case 'd': + case '.': + case ':': + case '-': + case '_': + case 'X': + bappend(ch); + break; + + case 'Z': + panic(FAULT); + + default: + bkch(); + return; + } + } + } + + /** + * Recognizes a keyword. + * + * This is low level routine which recognizes one of keywords in the buffer. + * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - + * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - + * Q IMPLIED - I FIXED - F + * + * @return an id of a keyword or '?'. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private char bkeyword() + throws Exception { + String str = new String(mBuff, 1, mBuffIdx); + switch (str.length()) { + case 2: // ID + return ("ID".equals(str) == true) ? 'i' : '?'; + + case 5: // IDREF, CDATA, FIXED + switch (mBuff[1]) { + case 'I': + return ("IDREF".equals(str) == true) ? 
'r' : '?'; + case 'C': + return ("CDATA".equals(str) == true) ? 'c' : '?'; + case 'F': + return ("FIXED".equals(str) == true) ? 'F' : '?'; + default: + break; + } + break; + + case 6: // IDREFS, ENTITY + switch (mBuff[1]) { + case 'I': + return ("IDREFS".equals(str) == true) ? 'R' : '?'; + case 'E': + return ("ENTITY".equals(str) == true) ? 'n' : '?'; + default: + break; + } + break; + + case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT + switch (mBuff[1]) { + case 'I': + return ("IMPLIED".equals(str) == true) ? 'I' : '?'; + case 'N': + return ("NMTOKEN".equals(str) == true) ? 't' : '?'; + case 'A': + return ("ATTLIST".equals(str) == true) ? 'a' : '?'; + case 'E': + return ("ELEMENT".equals(str) == true) ? 'e' : '?'; + default: + break; + } + break; + + case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED + switch (mBuff[2]) { + case 'N': + return ("ENTITIES".equals(str) == true) ? 'N' : '?'; + case 'M': + return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; + case 'O': + return ("NOTATION".equals(str) == true) ? 'o' : '?'; + case 'E': + return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; + default: + break; + } + break; + + default: + break; + } + return '?'; + } + + /** + * Reads a single or double quotted string in to the buffer. + * + * This method resolves entities inside a string unless the parser parses + * DTD. + * + * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - + * not an attribute value; 'd' - in DTD context. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private void bqstr(char flag) + throws Exception { + Input inp = mInp; // remember the original input + mBuffIdx = -1; + bappend((char) 0); // default offset to the colon char + char ch; + for (short st = 0; st >= 0;) { + ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); + switch (st) { + case 0: // read a single or double quote + switch (ch) { + case ' ': + case '\n': + case '\r': + case '\t': + break; + + case '\'': + st = 2; // read a single quoted string + break; + + case '\"': + st = 3; // read a double quoted string + break; + + default: + panic(FAULT); + break; + } + break; + + case 2: // read a single quoted string + case 3: // read a double quoted string + switch (ch) { + case '\'': + if ((st == 2) && (mInp == inp)) { + st = -1; + } else { + bappend(ch); + } + break; + + case '\"': + if ((st == 3) && (mInp == inp)) { + st = -1; + } else { + bappend(ch); + } + break; + + case '&': + if (flag != 'd') { + ent(flag); + } else { + bappend(ch); + } + break; + + case '%': + if (flag == 'd') { + pent('-'); + } else { + bappend(ch); + } + break; + + case '<': + if ((flag == '-') || (flag == 'd')) { + bappend(ch); + } else { + panic(FAULT); + } + break; + + case EOS: // EOS before single/double quote + panic(FAULT); + + case '\r': // EOL processing [#2.11 & #3.3.3] + if (flag != ' ' && mInp.next == null) { + if (getch() != '\n') { + bkch(); + } + ch = '\n'; + } + default: + bappend(ch, flag); + break; + } + break; + + default: + panic(FAULT); + } + } + // There is maximum one space at the end of the string in + // i-mode (non CDATA normalization) and it has to be removed. + if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { + mBuffIdx -= 1; + } + } + + /** + * Reports characters and empties the parser's buffer. This method is called + * only if parser is going to return control to the main loop. This means + * that this method may use parser buffer to report white space without + * copeing characters to temporary buffer. + */ + protected abstract void bflash() + throws Exception; + + /** + * Reports white space characters and empties the parser's buffer. This + * method is called only if parser is going to return control to the main + * loop. 
This means that this method may use parser buffer to report white + * space without copeing characters to temporary buffer. + */ + protected abstract void bflash_ws() + throws Exception; + + /** + * Appends a character to parser's buffer with normalization. + * + * @param ch The character to append to the buffer. + * @param mode The normalization mode. + */ + private void bappend(char ch, char mode) { + // This implements attribute value normalization as + // described in the XML specification [#3.3.3]. + switch (mode) { + case 'i': // non CDATA normalization + switch (ch) { + case ' ': + case '\n': + case '\r': + case '\t': + if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { + bappend(' '); + } + return; + + default: + break; + } + break; + + case 'c': // CDATA normalization + switch (ch) { + case '\n': + case '\r': + case '\t': + ch = ' '; + break; + + default: + break; + } + break; + + default: // no normalization + break; + } + mBuffIdx++; + if (mBuffIdx < mBuff.length) { + mBuff[mBuffIdx] = ch; + } else { + mBuffIdx--; + bappend(ch); + } + } + + /** + * Appends a character to parser's buffer. + * + * @param ch The character to append to the buffer. + */ + private void bappend(char ch) { + try { + mBuff[++mBuffIdx] = ch; + } catch (Exception exp) { + // Double the buffer size + char buff[] = new char[mBuff.length << 1]; + System.arraycopy(mBuff, 0, buff, 0, mBuff.length); + mBuff = buff; + mBuff[mBuffIdx] = ch; + } + } + + /** + * Appends (mChIdx - cidx) characters from character buffer (mChars) to + * parser's buffer (mBuff). + * + * @param cidx The character buffer (mChars) start index. + * @param bidx The parser buffer (mBuff) start index. 
+ */ + private void bcopy(int cidx, int bidx) { + int length = mChIdx - cidx; + if ((bidx + length + 1) >= mBuff.length) { + // Expand the buffer + char buff[] = new char[mBuff.length + length]; + System.arraycopy(mBuff, 0, buff, 0, mBuff.length); + mBuff = buff; + } + System.arraycopy(mChars, cidx, mBuff, bidx, length); + mBuffIdx += length; + } + + /** + * Recognizes the built-in entities lt, gt, amp, + * apos, quot. The initial state is 0x100. Any state belowe + * 0x100 is a built-in entity replacement character. + * + * @param ch the next character of an entity name. + */ + private void eappend(char ch) { + switch (mESt) { + case 0x100: // "l" or "g" or "a" or "q" + switch (ch) { + case 'l': + mESt = 0x101; + break; + case 'g': + mESt = 0x102; + break; + case 'a': + mESt = 0x103; + break; + case 'q': + mESt = 0x107; + break; + default: + mESt = 0x200; + break; + } + break; + + case 0x101: // "lt" + mESt = (ch == 't') ? '<' : (char) 0x200; + break; + + case 0x102: // "gt" + mESt = (ch == 't') ? '>' : (char) 0x200; + break; + + case 0x103: // "am" or "ap" + switch (ch) { + case 'm': + mESt = 0x104; + break; + case 'p': + mESt = 0x105; + break; + default: + mESt = 0x200; + break; + } + break; + + case 0x104: // "amp" + mESt = (ch == 'p') ? '&' : (char) 0x200; + break; + + case 0x105: // "apo" + mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; + break; + + case 0x106: // "apos" + mESt = (ch == 's') ? '\'' : (char) 0x200; + break; + + case 0x107: // "qu" + mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; + break; + + case 0x108: // "quo" + mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; + break; + + case 0x109: // "quot" + mESt = (ch == 't') ? '\"' : (char) 0x200; + break; + + case '<': // "lt" + case '>': // "gt" + case '&': // "amp" + case '\'': // "apos" + case '\"': // "quot" + mESt = 0x200; + default: + break; + } + } + + /** + * Sets up a new input source on the top of the input stack. 
Note, the first + * byte returned by the entity's byte stream has to be the first byte in the + * entity. However, the parser does not expect the byte order mask in both + * cases when encoding is provided by the input source. + * + * @param is A new input source to set up. + * @exception IOException If any IO errors occur. + * @exception Exception is parser specific exception form panic method. + */ + protected void setinp(InputSource is) + throws Exception { + Reader reader = null; + mChIdx = 0; + mChLen = 0; + mChars = mInp.chars; + mInp.src = null; + if (mPh < PH_DOC_START) { + mIsSAlone = false; // default [#2.9] + } + mIsSAloneSet = false; + if (is.getCharacterStream() != null) { + // Ignore encoding in the xml text decl. + reader = is.getCharacterStream(); + xml(reader); + } else if (is.getByteStream() != null) { + String expenc; + if (is.getEncoding() != null) { + // Ignore encoding in the xml text decl. + expenc = is.getEncoding().toUpperCase(); + if (expenc.equals("UTF-16")) { + reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] + } else { + reader = enc(expenc, is.getByteStream()); + } + xml(reader); + } else { + // Get encoding from BOM or the xml text decl. + reader = bom(is.getByteStream(), ' '); + if (reader == null) { + // Encoding is defined by the xml text decl. + reader = enc("UTF-8", is.getByteStream()); + expenc = xml(reader); + if (expenc.startsWith("UTF-16")) { + panic(FAULT); // UTF-16 must have BOM [#4.3.3] + } + reader = enc(expenc, is.getByteStream()); + } else { + // Encoding is defined by the BOM. + xml(reader); + } + } + } else { + // There is no support for public/system identifiers. + panic(FAULT); + } + mInp.src = reader; + mInp.pubid = is.getPublicId(); + mInp.sysid = is.getSystemId(); + } + + /** + * Determines the entity encoding. + * + * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the + * first byte returned by the entity's byte stream has to be the first byte + * in the entity. 
Also, there is no support for UCS-4. + * + * @param is A byte stream of the entity. + * @param hint An encoding hint, character U means UTF-16. + * @return a reader constructed from the BOM or UTF-8 by default. + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private Reader bom(InputStream is, char hint) + throws Exception { + int val = is.read(); + switch (val) { + case 0xef: // UTF-8 + if (hint == 'U') // must be UTF-16 + { + panic(FAULT); + } + if (is.read() != 0xbb) { + panic(FAULT); + } + if (is.read() != 0xbf) { + panic(FAULT); + } + return new ReaderUTF8(is); + + case 0xfe: // UTF-16, big-endian + if (is.read() != 0xff) { + panic(FAULT); + } + return new ReaderUTF16(is, 'b'); + + case 0xff: // UTF-16, little-endian + if (is.read() != 0xfe) { + panic(FAULT); + } + return new ReaderUTF16(is, 'l'); + + case -1: + mChars[mChIdx++] = EOS; + return new ReaderUTF8(is); + + default: + if (hint == 'U') // must be UTF-16 + { + panic(FAULT); + } + // Read the rest of UTF-8 character + switch (val & 0xf0) { + case 0xc0: + case 0xd0: + mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); + break; + + case 0xe0: + mChars[mChIdx++] = (char) (((val & 0x0f) << 12) + | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); + break; + + case 0xf0: // UCS-4 character + throw new UnsupportedEncodingException(); + + default: + mChars[mChIdx++] = (char) val; + break; + } + return null; + } + } + + /** + * Parses the xml text declaration. + * + * This method gets encoding from the xml text declaration [#4.3.1] if any. + * The method assumes the buffer (mChars) is big enough to accomodate whole + * xml text declaration. + * + * @param reader is entity reader. + * @return The xml text declaration encoding or default UTF-8 encoding. + * @exception Exception is parser specific exception form panic method. 
+ * @exception IOException + */ + private String xml(Reader reader) + throws Exception { + String str = null; + String enc = "UTF-8"; + char ch; + int val; + short st; + // Read the xml text declaration into the buffer + if (mChIdx != 0) { + // The bom method have read ONE char into the buffer. + st = (short) ((mChars[0] == '<') ? 1 : -1); + } else { + st = 0; + } + while (st >= 0 && mChIdx < mChars.length) { + ch = ((val = reader.read()) >= 0) ? (char) val : EOS; + mChars[mChIdx++] = ch; + switch (st) { + case 0: // read '<' of xml declaration + switch (ch) { + case '<': + st = 1; + break; + + case 0xfeff: // the byte order mask + ch = ((val = reader.read()) >= 0) ? (char) val : EOS; + mChars[mChIdx - 1] = ch; + st = (short) ((ch == '<') ? 1 : -1); + break; + + default: + st = -1; + break; + } + break; + + case 1: // read '?' of xml declaration [#4.3.1] + st = (short) ((ch == '?') ? 2 : -1); + break; + + case 2: // read 'x' of xml declaration [#4.3.1] + st = (short) ((ch == 'x') ? 3 : -1); + break; + + case 3: // read 'm' of xml declaration [#4.3.1] + st = (short) ((ch == 'm') ? 4 : -1); + break; + + case 4: // read 'l' of xml declaration [#4.3.1] + st = (short) ((ch == 'l') ? 5 : -1); + break; + + case 5: // read white space after 'xml' + switch (ch) { + case ' ': + case '\t': + case '\r': + case '\n': + st = 6; + break; + + default: + st = -1; + break; + } + break; + + case 6: // read content of xml declaration + switch (ch) { + case '?': + st = 7; + break; + + case EOS: + st = -2; + break; + + default: + break; + } + break; + + case 7: // read '>' after '?' of xml declaration + switch (ch) { + case '>': + case EOS: + st = -2; + break; + + default: + st = 6; + break; + } + break; + + default: + panic(FAULT); + break; + } + } + mChLen = mChIdx; + mChIdx = 0; + // If there is no xml text declaration, the encoding is default. 
+ if (st == -1) { + return enc; + } + mChIdx = 5; // the first white space after "= 0;) { + ch = getch(); + switch (st) { + case 0: // skip spaces after the xml declaration name + if (chtyp(ch) != ' ') { + bkch(); + st = 1; + } + break; + + case 1: // read xml declaration version + case 2: // read xml declaration encoding or standalone + case 3: // read xml declaration standalone + switch (chtyp(ch)) { + case 'a': + case 'A': + case '_': + bkch(); + str = name(false).toLowerCase(); + if ("version".equals(str) == true) { + if (st != 1) { + panic(FAULT); + } + if ("1.0".equals(eqstr('=')) != true) { + panic(FAULT); + } + mInp.xmlver = 0x0100; + st = 2; + } else if ("encoding".equals(str) == true) { + if (st != 2) { + panic(FAULT); + } + mInp.xmlenc = eqstr('=').toUpperCase(); + enc = mInp.xmlenc; + st = 3; + } else if ("standalone".equals(str) == true) { + if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] + { + panic(FAULT); + } + str = eqstr('=').toLowerCase(); + // Check the 'standalone' value and use it [#5.1] + if (str.equals("yes") == true) { + mIsSAlone = true; + } else if (str.equals("no") == true) { + mIsSAlone = false; + } else { + panic(FAULT); + } + mIsSAloneSet = true; + st = 4; + } else { + panic(FAULT); + } + break; + + case ' ': + break; + + case '?': + if (st == 1) { + panic(FAULT); + } + bkch(); + st = 4; + break; + + default: + panic(FAULT); + } + break; + + case 4: // end of xml declaration + switch (chtyp(ch)) { + case '?': + if (getch() != '>') { + panic(FAULT); + } + if (mPh <= PH_DOC_START) { + mPh = PH_MISC_DTD; // misc before DTD + } + st = -1; + break; + + case ' ': + break; + + default: + panic(FAULT); + } + break; + + default: + panic(FAULT); + } + } + return enc; + } + + /** + * Sets up the document reader. + * + * @param name an encoding name. + * @param is the document byte input stream. + * @return a reader constructed from encoding name and input stream. 
+ * @exception UnsupportedEncodingException + */ + private Reader enc(String name, InputStream is) + throws UnsupportedEncodingException { + // DO NOT CLOSE current reader if any! + if (name.equals("UTF-8")) { + return new ReaderUTF8(is); + } else if (name.equals("UTF-16LE")) { + return new ReaderUTF16(is, 'l'); + } else if (name.equals("UTF-16BE")) { + return new ReaderUTF16(is, 'b'); + } else { + return new InputStreamReader(is, name); + } + } + + /** + * Sets up current input on the top of the input stack. + * + * @param inp A new input to set up. + */ + protected void push(Input inp) { + mInp.chLen = mChLen; + mInp.chIdx = mChIdx; + inp.next = mInp; + mInp = inp; + mChars = inp.chars; + mChLen = inp.chLen; + mChIdx = inp.chIdx; + } + + /** + * Restores previous input on the top of the input stack. + */ + protected void pop() { + if (mInp.src != null) { + try { + mInp.src.close(); + } catch (IOException ioe) { + } + mInp.src = null; + } + mInp = mInp.next; + if (mInp != null) { + mChars = mInp.chars; + mChLen = mInp.chLen; + mChIdx = mInp.chIdx; + } else { + mChars = null; + mChLen = 0; + mChIdx = 0; + } + } + + /** + * Maps a character to it's type. + * + * Possible character type values are:
     * - ' ' for any kind of white space character;<br />
     * - 'a' for any lower case alphabetical character value;<br />
     * - 'A' for any upper case alphabetical character value;<br />
     * - 'd' for any decimal digit character value;<br />
     * - 'z' for any character less than ' ' except '\t', '\n', '\r';<br />
     * - 'X' for any non-ASCII character;<br />
     * - 'Z' for EOS character.<br />
An ASCII (7 bit) + * character which does not fall in any category listed above is mapped to + * it self. + * + * @param ch The character to map. + * @return The type of character. + */ + protected char chtyp(char ch) { + if (ch < 0x80) { + return (char) asctyp[ch]; + } + return (ch != EOS) ? 'X' : 'Z'; + } + + /** + * Retrives the next character in the document. + * + * @return The next character in the document. + */ + protected char getch() + throws IOException { + if (mChIdx >= mChLen) { + if (mInp.src == null) { + pop(); // remove internal entity + return getch(); + } + // Read new portion of the document characters + int Num = mInp.src.read(mChars, 0, mChars.length); + if (Num < 0) { + if (mInp != mDoc) { + pop(); // restore the previous input + return getch(); + } else { + mChars[0] = EOS; + mChLen = 1; + } + } else { + mChLen = Num; + } + mChIdx = 0; + } + return mChars[mChIdx++]; + } + + /** + * Puts back the last read character. + * + * This method MUST NOT be called more then once after each + * call of {@link #getch getch} method. + */ + protected void bkch() + throws Exception { + if (mChIdx <= 0) { + panic(FAULT); + } + mChIdx--; + } + + /** + * Sets the current character. + * + * @param ch The character to set. + */ + protected void setch(char ch) { + mChars[mChIdx] = ch; + } + + /** + * Finds a pair in the pair chain by a qualified name. + * + * @param chain The first element of the chain of pairs. + * @param qname The qualified name. + * @return A pair with the specified qualified name or null. + */ + protected Pair find(Pair chain, char[] qname) { + for (Pair pair = chain; pair != null; pair = pair.next) { + if (pair.eqname(qname) == true) { + return pair; + } + } + return null; + } + + /** + * Provedes an instance of a pair. + * + * @param next The reference to a next pair. + * @return An instance of a pair. 
+ */ + protected Pair pair(Pair next) { + Pair pair; + + if (mDltd != null) { + pair = mDltd; + mDltd = pair.next; + } else { + pair = new Pair(); + } + pair.next = next; + + return pair; + } + + /** + * Deletes an instance of a pair. + * + * @param pair The pair to delete. + * @return A reference to the next pair in a chain. + */ + protected Pair del(Pair pair) { + Pair next = pair.next; + + pair.name = null; + pair.value = null; + pair.chars = null; + pair.list = null; + pair.next = mDltd; + mDltd = pair; + + return next; + } +}