/* * Copyright (c) 2012, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package jdk.internal.util.xml.impl; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; import jdk.internal.org.xml.sax.InputSource; import jdk.internal.org.xml.sax.SAXException; /** * XML non-validating parser engine. */ public abstract class Parser { public static final String FAULT = ""; protected static final int BUFFSIZE_READER = 512; protected static final int BUFFSIZE_PARSER = 128; /** * The end of stream character. */ public static final char EOS = 0xffff; private Pair mNoNS; // there is no namespace private Pair mXml; // the xml namespace private Map mEnt; // the entities look up table private Map mPEnt; // the parmeter entities look up table protected boolean mIsSAlone; // xml decl standalone flag protected boolean mIsSAloneSet; // standalone is explicitely set protected boolean mIsNSAware; // if true - namespace aware mode protected int mPh; // current phase of document processing protected static final int PH_BEFORE_DOC = -1; // before parsing protected static final int PH_DOC_START = 0; // document start protected static final int PH_MISC_DTD = 1; // misc before DTD protected static final int PH_DTD = 2; // DTD protected static final int PH_DTD_MISC = 3; // misc after DTD protected static final int PH_DOCELM = 4; // document's element protected static final int PH_DOCELM_MISC = 5; // misc after element protected static final int PH_AFTER_DOC = 6; // after parsing protected int mEvt; // current event type protected static final int EV_NULL = 0; // unknown protected static final int EV_ELM = 1; // empty element protected static final int EV_ELMS = 2; // start element protected static final int EV_ELME = 3; // end element protected static final int EV_TEXT = 4; // textual content protected static final int EV_WSPC = 5; // white space content protected static final int EV_PI = 6; // processing instruction protected static final int EV_CDAT = 7; // character data protected static final int EV_COMM = 8; // comment protected static final int EV_DTD = 9; // document type definition protected static final int EV_ENT = 10; // skipped entity private char mESt; // built-in entity recognizer state // mESt values: // 0x100 : the initial state // > 0x100 : unrecognized name // < 0x100 : replacement character protected char[] mBuff; // parser buffer protected int mBuffIdx; // index of the last char protected Pair mPref; // stack of prefixes protected Pair mElm; // stack of elements // mAttL.chars - element qname // mAttL.next - next element // mAttL.list - list of attributes defined on this element // mAttL.list.chars - attribute qname // mAttL.list.id - a char representing attribute's type see below // mAttL.list.next - next attribute defined on the element // mAttL.list.list - devault value structure or null // mAttL.list.list.chars - "name='value' " chars array for Input // // Attribute type character values: // 'i' - "ID" // 'r' - "IDREF" // 'R' - "IDREFS" // 'n' - "ENTITY" // 'N' - "ENTITIES" // 't' - "NMTOKEN" // 'T' - "NMTOKENS" // 'u' - enumeration type // 'o' - "NOTATION" // 'c' - "CDATA" // see also: bkeyword() and atype() // protected Pair mAttL; // list of defined attrs by element name protected Input mDoc; // document entity protected Input mInp; // stack of entities private char[] mChars; // reading buffer private int mChLen; // current capacity private int mChIdx; // index to the next char protected Attrs mAttrs; // attributes of the curr. element private String[] mItems; // attributes array of the curr. element private char mAttrIdx; // attributes counter/index private String mUnent; // unresolved entity name private Pair mDltd; // deleted objects for reuse /** * Default prefixes */ private static final char NONS[]; private static final char XML[]; private static final char XMLNS[]; static { NONS = new char[1]; NONS[0] = (char) 0; XML = new char[4]; XML[0] = (char) 4; XML[1] = 'x'; XML[2] = 'm'; XML[3] = 'l'; XMLNS = new char[6]; XMLNS[0] = (char) 6; XMLNS[1] = 'x'; XMLNS[2] = 'm'; XMLNS[3] = 'l'; XMLNS[4] = 'n'; XMLNS[5] = 's'; } /** * ASCII character type array. * * This array maps an ASCII (7 bit) character to the character type.
* Possible character type values are:
- ' ' for any kind of white * space character;
- 'a' for any lower case alphabetical character * value;
- 'A' for any upper case alphabetical character value;
* - 'd' for any decimal digit character value;
- 'z' for any * character less than ' ' except '\t', '\n', '\r';
An ASCII (7 bit) * character which does not fall in any category listed above is mapped to * it self. */ private static final byte asctyp[]; /** * NMTOKEN character type array. * * This array maps an ASCII (7 bit) character to the character type.
* Possible character type values are:
- 0 for underscore ('_') or any * lower and upper case alphabetical character value;
- 1 for colon * (':') character;
- 2 for dash ('-') and dot ('.') or any decimal * digit character value;
- 3 for any kind of white space character
* An ASCII (7 bit) character which does not fall in any category listed * above is mapped to 0xff. */ private static final byte nmttyp[]; /** * Static constructor. * * Sets up the ASCII character type array which is used by * {@link #asctyp asctyp} method and NMTOKEN character type array. */ static { short i = 0; asctyp = new byte[0x80]; while (i < ' ') { asctyp[i++] = (byte) 'z'; } asctyp['\t'] = (byte) ' '; asctyp['\r'] = (byte) ' '; asctyp['\n'] = (byte) ' '; while (i < '0') { asctyp[i] = (byte) i++; } while (i <= '9') { asctyp[i++] = (byte) 'd'; } while (i < 'A') { asctyp[i] = (byte) i++; } while (i <= 'Z') { asctyp[i++] = (byte) 'A'; } while (i < 'a') { asctyp[i] = (byte) i++; } while (i <= 'z') { asctyp[i++] = (byte) 'a'; } while (i < 0x80) { asctyp[i] = (byte) i++; } nmttyp = new byte[0x80]; for (i = 0; i < '0'; i++) { nmttyp[i] = (byte) 0xff; } while (i <= '9') { nmttyp[i++] = (byte) 2; // digits } while (i < 'A') { nmttyp[i++] = (byte) 0xff; } // skiped upper case alphabetical character are already 0 for (i = '['; i < 'a'; i++) { nmttyp[i] = (byte) 0xff; } // skiped lower case alphabetical character are already 0 for (i = '{'; i < 0x80; i++) { nmttyp[i] = (byte) 0xff; } nmttyp['_'] = 0; nmttyp[':'] = 1; nmttyp['.'] = 2; nmttyp['-'] = 2; nmttyp[' '] = 3; nmttyp['\t'] = 3; nmttyp['\r'] = 3; nmttyp['\n'] = 3; } /** * Constructor. */ protected Parser() { mPh = PH_BEFORE_DOC; // before parsing // Initialize the parser mBuff = new char[BUFFSIZE_PARSER]; mAttrs = new Attrs(); // Default namespace mPref = pair(mPref); mPref.name = ""; mPref.value = ""; mPref.chars = NONS; mNoNS = mPref; // no namespace // XML namespace mPref = pair(mPref); mPref.name = "xml"; mPref.value = "http://www.w3.org/XML/1998/namespace"; mPref.chars = XML; mXml = mPref; // XML namespace } /** * Initializes parser's internals. Note, current input has to be set before * this method is called. */ protected void init() { mUnent = null; mElm = null; mPref = mXml; mAttL = null; mPEnt = new HashMap<>(); mEnt = new HashMap<>(); mDoc = mInp; // current input is document entity mChars = mInp.chars; // use document entity buffer mPh = PH_DOC_START; // the begining of the document } /** * Cleans up parser internal resources. */ protected void cleanup() { // Default attributes while (mAttL != null) { while (mAttL.list != null) { if (mAttL.list.list != null) { del(mAttL.list.list); } mAttL.list = del(mAttL.list); } mAttL = del(mAttL); } // Element stack while (mElm != null) { mElm = del(mElm); } // Namespace prefixes while (mPref != mXml) { mPref = del(mPref); } // Inputs while (mInp != null) { pop(); } // Document reader if ((mDoc != null) && (mDoc.src != null)) { try { mDoc.src.close(); } catch (IOException ioe) { } } mPEnt = null; mEnt = null; mDoc = null; mPh = PH_AFTER_DOC; // before documnet processing } /** * Processes a portion of document. This method returns one of EV_* * constants as an identifier of the portion of document have been read. * * @return Identifier of processed document portion. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") protected int step() throws Exception { mEvt = EV_NULL; int st = 0; while (mEvt == EV_NULL) { char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); switch (st) { case 0: // all sorts of markup (dispetcher) if (ch != '<') { bkch(); mBuffIdx = -1; // clean parser buffer st = 1; break; } switch (getch()) { case '/': // the end of the element content mEvt = EV_ELME; if (mElm == null) { panic(FAULT); } // Check element's open/close tags balance mBuffIdx = -1; // clean parser buffer bname(mIsNSAware); char[] chars = mElm.chars; if (chars.length == (mBuffIdx + 1)) { for (char i = 1; i <= mBuffIdx; i += 1) { if (chars[i] != mBuff[i]) { panic(FAULT); } } } else { panic(FAULT); } // Skip white spaces before '>' if (wsskip() != '>') { panic(FAULT); } getch(); // read '>' break; case '!': // a comment or a CDATA ch = getch(); bkch(); switch (ch) { case '-': // must be a comment mEvt = EV_COMM; comm(); break; case '[': // must be a CDATA section mEvt = EV_CDAT; cdat(); break; default: // must be 'DOCTYPE' mEvt = EV_DTD; dtd(); break; } break; case '?': // processing instruction mEvt = EV_PI; pi(); break; default: // must be the first char of an xml name bkch(); // Read an element name and put it on top of the // element stack mElm = pair(mElm); // add new element to the stack mElm.chars = qname(mIsNSAware); mElm.name = mElm.local(); mElm.id = (mElm.next != null) ? mElm.next.id : 0; // flags mElm.num = 0; // namespace counter // Find the list of defined attributs of the current // element Pair elm = find(mAttL, mElm.chars); mElm.list = (elm != null) ? elm.list : null; // Read attributes till the end of the element tag mAttrIdx = 0; Pair att = pair(null); att.num = 0; // clear attribute's flags attr(att); // get all attributes inc. defaults del(att); mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null; // Skip white spaces before '>' switch (wsskip()) { case '>': getch(); // read '>' mEvt = EV_ELMS; break; case '/': getch(); // read '/' if (getch() != '>') // read '>' { panic(FAULT); } mEvt = EV_ELM; break; default: panic(FAULT); } break; } break; case 1: // read white space switch (ch) { case ' ': case '\t': case '\n': bappend(ch); break; case '\r': // EOL processing [#2.11] if (getch() != '\n') { bkch(); } bappend('\n'); break; case '<': mEvt = EV_WSPC; bkch(); bflash_ws(); break; default: bkch(); st = 2; break; } break; case 2: // read the text content of the element switch (ch) { case '&': if (mUnent == null) { // There was no unresolved entity on previous step. if ((mUnent = ent('x')) != null) { mEvt = EV_TEXT; bkch(); // move back to ';' after entity name setch('&'); // parser must be back on next step bflash(); } } else { // There was unresolved entity on previous step. mEvt = EV_ENT; skippedEnt(mUnent); mUnent = null; } break; case '<': mEvt = EV_TEXT; bkch(); bflash(); break; case '\r': // EOL processing [#2.11] if (getch() != '\n') { bkch(); } bappend('\n'); break; case EOS: panic(FAULT); default: bappend(ch); break; } break; default: panic(FAULT); } } return mEvt; } /** * Parses the document type declaration. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtd() throws Exception { char ch; String str = null; String name = null; Pair psid = null; // read 'DOCTYPE' if ("DOCTYPE".equals(name(false)) != true) { panic(FAULT); } mPh = PH_DTD; // DTD for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // read the document type name if (chtyp(ch) != ' ') { bkch(); name = name(mIsNSAware); wsskip(); st = 1; // read 'PUPLIC' or 'SYSTEM' } break; case 1: // read 'PUPLIC' or 'SYSTEM' switch (chtyp(ch)) { case 'A': bkch(); psid = pubsys(' '); st = 2; // skip spaces before internal subset docType(name, psid.name, psid.value); break; case '[': bkch(); st = 2; // skip spaces before internal subset docType(name, null, null); break; case '>': bkch(); st = 3; // skip spaces after internal subset docType(name, null, null); break; default: panic(FAULT); } break; case 2: // skip spaces before internal subset switch (chtyp(ch)) { case '[': // Process internal subset dtdsub(); st = 3; // skip spaces after internal subset break; case '>': // There is no internal subset bkch(); st = 3; // skip spaces after internal subset break; case ' ': // skip white spaces break; default: panic(FAULT); } break; case 3: // skip spaces after internal subset switch (chtyp(ch)) { case '>': if (psid != null) { // Report the DTD external subset InputSource is = resolveEnt(name, psid.name, psid.value); if (is != null) { if (mIsSAlone == false) { // Set the end of DTD external subset char bkch(); setch(']'); // Set the DTD external subset InputSource push(new Input(BUFFSIZE_READER)); setinp(is); mInp.pubid = psid.name; mInp.sysid = psid.value; // Parse the DTD external subset dtdsub(); } else { // Unresolved DTD external subset skippedEnt("[dtd]"); // Release reader and stream if (is.getCharacterStream() != null) { try { is.getCharacterStream().close(); } catch (IOException ioe) { } } if (is.getByteStream() != null) { try { is.getByteStream().close(); } catch (IOException ioe) { } } } } else { // Unresolved DTD external subset skippedEnt("[dtd]"); } del(psid); } st = -1; // end of DTD break; case ' ': // skip white spaces break; default: panic(FAULT); } break; default: panic(FAULT); } } } /** * Parses the document type declaration subset. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtdsub() throws Exception { startInternalSub(); // reports the event before parsing the subset char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // skip white spaces before a declaration switch (chtyp(ch)) { case '<': ch = getch(); switch (ch) { case '?': pi(); break; case '!': ch = getch(); bkch(); if (ch == '-') { comm(); break; } // A markup or an entity declaration bntok(); switch (bkeyword()) { case 'n': dtdent(); break; case 'a': dtdattl(); // parse attributes declaration break; case 'e': dtdelm(); // parse element declaration break; case 'o': dtdnot(); // parse notation declaration break; default: panic(FAULT); // unsupported markup declaration break; } st = 1; // read the end of declaration break; default: panic(FAULT); break; } break; case '%': // A parameter entity reference pent(' '); break; case ']': // End of DTD subset st = -1; break; case ' ': // Skip white spaces break; case 'Z': // End of stream if (getch() != ']') { panic(FAULT); } st = -1; break; default: panic(FAULT); } break; case 1: // read the end of declaration switch (ch) { case '>': // there is no notation st = 0; // skip white spaces before a declaration break; case ' ': case '\n': case '\r': case '\t': // Skip white spaces break; default: panic(FAULT); break; } break; default: panic(FAULT); } } } /** * Parses an entity declaration. This method fills the general ( * mEnt) and parameter * ( * mPEnt) entity look up table. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void dtdent() throws Exception { String str = null; char[] val = null; Input inp = null; Pair ids = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // skip white spaces before entity name switch (chtyp(ch)) { case ' ': // Skip white spaces break; case '%': // Parameter entity or parameter entity declaration. ch = getch(); bkch(); if (chtyp(ch) == ' ') { // Parameter entity declaration. wsskip(); str = name(false); switch (chtyp(wsskip())) { case 'A': // Read the external identifier ids = pubsys(' '); if (wsskip() == '>') { // External parsed entity if (mPEnt.containsKey(str) == false) { // [#4.2] inp = new Input(); inp.pubid = ids.name; inp.sysid = ids.value; mPEnt.put(str, inp); } } else { panic(FAULT); } del(ids); st = -1; // the end of declaration break; case '\"': case '\'': // Read the parameter entity value bqstr('d'); // Create the parameter entity value val = new char[mBuffIdx + 1]; System.arraycopy(mBuff, 1, val, 1, val.length - 1); // Add surrounding spaces [#4.4.8] val[0] = ' '; // Add the entity to the entity look up table if (mPEnt.containsKey(str) == false) { // [#4.2] inp = new Input(val); inp.pubid = mInp.pubid; inp.sysid = mInp.sysid; inp.xmlenc = mInp.xmlenc; inp.xmlver = mInp.xmlver; mPEnt.put(str, inp); } st = -1; // the end of declaration break; default: panic(FAULT); break; } } else { // Parameter entity reference. pent(' '); } break; default: bkch(); str = name(false); st = 1; // read entity declaration value break; } break; case 1: // read entity declaration value switch (chtyp(ch)) { case '\"': // internal entity case '\'': bkch(); bqstr('d'); // read a string into the buffer if (mEnt.get(str) == null) { // Create general entity value val = new char[mBuffIdx]; System.arraycopy(mBuff, 1, val, 0, val.length); // Add the entity to the entity look up table if (mEnt.containsKey(str) == false) { // [#4.2] inp = new Input(val); inp.pubid = mInp.pubid; inp.sysid = mInp.sysid; inp.xmlenc = mInp.xmlenc; inp.xmlver = mInp.xmlver; mEnt.put(str, inp); } } st = -1; // the end of declaration break; case 'A': // external entity bkch(); ids = pubsys(' '); switch (wsskip()) { case '>': // external parsed entity if (mEnt.containsKey(str) == false) { // [#4.2] inp = new Input(); inp.pubid = ids.name; inp.sysid = ids.value; mEnt.put(str, inp); } break; case 'N': // external general unparsed entity if ("NDATA".equals(name(false)) == true) { wsskip(); unparsedEntDecl(str, ids.name, ids.value, name(false)); break; } default: panic(FAULT); break; } del(ids); st = -1; // the end of declaration break; case ' ': // Skip white spaces break; default: panic(FAULT); break; } break; default: panic(FAULT); } } } /** * Parses an element declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void dtdelm() throws Exception { // This is stub implementation which skips an element // declaration. wsskip(); name(mIsNSAware); char ch; while (true) { ch = getch(); switch (ch) { case '>': bkch(); return; case EOS: panic(FAULT); default: break; } } } /** * Parses an attribute list declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtdattl() throws Exception { char elmqn[] = null; Pair elm = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // read the element name switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); // Get the element from the list or add a new one. elmqn = qname(mIsNSAware); elm = find(mAttL, elmqn); if (elm == null) { elm = pair(mAttL); elm.chars = elmqn; mAttL = elm; } st = 1; // read an attribute declaration break; case ' ': break; case '%': pent(' '); break; default: panic(FAULT); break; } break; case 1: // read an attribute declaration switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); dtdatt(elm); if (wsskip() == '>') { return; } break; case ' ': break; case '%': pent(' '); break; default: panic(FAULT); break; } break; default: panic(FAULT); break; } } } /** * Parses an attribute declaration. * * The attribute uses the following fields of Pair object: chars - characters * of qualified name id - the type identifier of the attribute list - a pair * which holds the default value (chars field) * * @param elm An object which represents all defined attributes on an * element. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void dtdatt(Pair elm) throws Exception { char attqn[] = null; Pair att = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // the attribute name switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); // Get the attribute from the list or add a new one. attqn = qname(mIsNSAware); att = find(elm.list, attqn); if (att == null) { // New attribute declaration att = pair(elm.list); att.chars = attqn; elm.list = att; } else { // Do not override the attribute declaration [#3.3] att = pair(null); att.chars = attqn; att.id = 'c'; } wsskip(); st = 1; break; case '%': pent(' '); break; case ' ': break; default: panic(FAULT); break; } break; case 1: // the attribute type switch (chtyp(ch)) { case '(': att.id = 'u'; // enumeration type st = 2; // read the first element of the list break; case '%': pent(' '); break; case ' ': break; default: bkch(); bntok(); // read type id att.id = bkeyword(); switch (att.id) { case 'o': // NOTATION if (wsskip() != '(') { panic(FAULT); } ch = getch(); st = 2; // read the first element of the list break; case 'i': // ID case 'r': // IDREF case 'R': // IDREFS case 'n': // ENTITY case 'N': // ENTITIES case 't': // NMTOKEN case 'T': // NMTOKENS case 'c': // CDATA wsskip(); st = 4; // read default declaration break; default: panic(FAULT); break; } break; } break; case 2: // read the first element of the list switch (chtyp(ch)) { case 'a': case 'A': case 'd': case '.': case ':': case '-': case '_': case 'X': bkch(); switch (att.id) { case 'u': // enumeration type bntok(); break; case 'o': // NOTATION mBuffIdx = -1; bname(false); break; default: panic(FAULT); break; } wsskip(); st = 3; // read next element of the list break; case '%': pent(' '); break; case ' ': break; default: panic(FAULT); break; } break; case 3: // read next element of the list switch (ch) { case ')': wsskip(); st = 4; // read default declaration break; case '|': wsskip(); switch (att.id) { case 'u': // enumeration type bntok(); break; case 'o': // NOTATION mBuffIdx = -1; bname(false); break; default: panic(FAULT); break; } wsskip(); break; case '%': pent(' '); break; default: panic(FAULT); break; } break; case 4: // read default declaration switch (ch) { case '#': bntok(); switch (bkeyword()) { case 'F': // FIXED switch (wsskip()) { case '\"': case '\'': st = 5; // read the default value break; case EOS: panic(FAULT); default: st = -1; break; } break; case 'Q': // REQUIRED case 'I': // IMPLIED st = -1; break; default: panic(FAULT); break; } break; case '\"': case '\'': bkch(); st = 5; // read the default value break; case ' ': case '\n': case '\r': case '\t': break; case '%': pent(' '); break; default: bkch(); st = -1; break; } break; case 5: // read the default value switch (ch) { case '\"': case '\'': bkch(); bqstr('d'); // the value in the mBuff now att.list = pair(null); // Create a string like "attqname='value' " att.list.chars = new char[att.chars.length + mBuffIdx + 3]; System.arraycopy( att.chars, 1, att.list.chars, 0, att.chars.length - 1); att.list.chars[att.chars.length - 1] = '='; att.list.chars[att.chars.length] = ch; System.arraycopy( mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx); att.list.chars[att.chars.length + mBuffIdx + 1] = ch; att.list.chars[att.chars.length + mBuffIdx + 2] = ' '; st = -1; break; default: panic(FAULT); break; } break; default: panic(FAULT); break; } } } /** * Parses a notation declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtdnot() throws Exception { wsskip(); String name = name(false); wsskip(); Pair ids = pubsys('N'); notDecl(name, ids.name, ids.value); del(ids); } /** * Parses an attribute. * * This recursive method is responsible for prefix addition * ( * mPref) on the way down. The element's start tag end triggers * the return process. The method then on it's way back resolves prefixes * and accumulates attributes. * *

att.num carries attribute flags where: 0x1 - attribute is * declared in DTD (attribute decalration had been read); 0x2 - attribute's * default value is used.

* * @param att An object which reprecents current attribute. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void attr(Pair att) throws Exception { switch (wsskip()) { case '/': case '>': if ((att.num & 0x2) == 0) { // all attributes have been read att.num |= 0x2; // set default attribute flag Input inp = mInp; // Go through all attributes defined on current element. for (Pair def = mElm.list; def != null; def = def.next) { if (def.list == null) // no default value { continue; } // Go through all attributes defined on current // element and add defaults. Pair act = find(att.next, def.chars); if (act == null) { push(new Input(def.list.chars)); } } if (mInp != inp) { // defaults have been added attr(att); return; } } // Ensure the attribute string array capacity mAttrs.setLength(mAttrIdx); mItems = mAttrs.mItems; return; case EOS: panic(FAULT); default: // Read the attribute name and value att.chars = qname(mIsNSAware); att.name = att.local(); String type = atype(att); // sets attribute's type on att.id wsskip(); if (getch() != '=') { panic(FAULT); } bqstr((char) att.id); // read the value with normalization. String val = new String(mBuff, 1, mBuffIdx); Pair next = pair(att); next.num = (att.num & ~0x1); // inherit attribute flags // Put a namespace declaration on top of the prefix stack if ((mIsNSAware == false) || (isdecl(att, val) == false)) { // An ordinary attribute mAttrIdx++; attr(next); // recursive call to parse the next attribute mAttrIdx--; // Add the attribute to the attributes string array char idx = (char) (mAttrIdx << 3); mItems[idx + 1] = att.qname(); // attr qname mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name mItems[idx + 3] = val; // attr value mItems[idx + 4] = type; // attr type switch (att.num & 0x3) { case 0x0: mItems[idx + 5] = null; break; case 0x1: // declared attribute mItems[idx + 5] = "d"; break; default: // 0x2, 0x3 - default attribute always declared mItems[idx + 5] = "D"; break; } // Resolve the prefix if any and report the attribute // NOTE: The attribute does not accept the default namespace. mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; } else { // A namespace declaration. mPref.name contains prefix and // mPref.value contains namespace URI set by isdecl method. // Report a start of the new mapping newPrefix(); // Recursive call to parse the next attribute attr(next); // NOTE: The namespace declaration is not reported. } del(next); break; } } /** * Retrieves attribute type. * * This method sets the type of normalization in the attribute * id field and returns the name of attribute type. * * @param att An object which represents current attribute. * @return The name of the attribute type. * @exception Exception is parser specific exception form panic method. */ private String atype(Pair att) throws Exception { Pair attr; // CDATA-type normalization by default [#3.3.3] att.id = 'c'; if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { return "CDATA"; } att.num |= 0x1; // attribute is declared // Non-CDATA normalization except when the attribute type is CDATA. att.id = 'i'; switch (attr.id) { case 'i': return "ID"; case 'r': return "IDREF"; case 'R': return "IDREFS"; case 'n': return "ENTITY"; case 'N': return "ENTITIES"; case 't': return "NMTOKEN"; case 'T': return "NMTOKENS"; case 'u': return "NMTOKEN"; case 'o': return "NOTATION"; case 'c': att.id = 'c'; return "CDATA"; default: panic(FAULT); } return null; } /** * Parses a comment. * * The '<!' part is read in dispatcher so the method starts * with first '-' after '<!'. * * @exception Exception is parser specific exception form panic method. */ @SuppressWarnings("fallthrough") private void comm() throws Exception { if (mPh == PH_DOC_START) { mPh = PH_MISC_DTD; // misc before DTD } // '= 0;) { ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); if (ch == EOS) { panic(FAULT); } switch (st) { case 0: // first '-' of the comment open if (ch == '-') { st = 1; } else { panic(FAULT); } break; case 1: // secind '-' of the comment open if (ch == '-') { st = 2; } else { panic(FAULT); } break; case 2: // skip the comment body switch (ch) { case '-': st = 3; break; default: bappend(ch); break; } break; case 3: // second '-' of the comment close switch (ch) { case '-': st = 4; break; default: bappend('-'); bappend(ch); st = 2; break; } break; case 4: // '>' of the comment close if (ch == '>') { comm(mBuff, mBuffIdx + 1); st = -1; break; } // else - panic [#2.5 compatibility note] default: panic(FAULT); } } } /** * Parses a processing instruction. * * The '<?' is read in dispatcher so the method starts with * first character of PI target name after '<?'. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void pi() throws Exception { // '= 0;) { ch = getch(); if (ch == EOS) { panic(FAULT); } switch (st) { case 0: // read the PI target name switch (chtyp(ch)) { case 'a': case 'A': case '_': case ':': case 'X': bkch(); str = name(false); // PI target name may not be empty string [#2.6] // PI target name 'XML' is reserved [#2.6] if ((str.length() == 0) || (mXml.name.equals(str.toLowerCase()) == true)) { panic(FAULT); } // This is processing instruction if (mPh == PH_DOC_START) // the begining of the document { mPh = PH_MISC_DTD; // misc before DTD } wsskip(); // skip spaces after the PI target name st = 1; // accumulate the PI body mBuffIdx = -1; break; default: panic(FAULT); } break; case 1: // accumulate the PI body switch (ch) { case '?': st = 2; // end of the PI body break; default: bappend(ch); break; } break; case 2: // end of the PI body switch (ch) { case '>': // PI has been read. pi(str, new String(mBuff, 0, mBuffIdx + 1)); st = -1; break; case '?': bappend('?'); break; default: bappend('?'); bappend(ch); st = 1; // accumulate the PI body break; } break; default: panic(FAULT); } } } /** * Parses a character data. * * The '<!' part is read in dispatcher so the method starts * with first '[' after '<!'. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void cdat() throws Exception { // '= 0;) { ch = getch(); switch (st) { case 0: // the first '[' of the CDATA open if (ch == '[') { st = 1; } else { panic(FAULT); } break; case 1: // read "CDATA" if (chtyp(ch) == 'A') { bappend(ch); } else { if ("CDATA".equals( new String(mBuff, 0, mBuffIdx + 1)) != true) { panic(FAULT); } bkch(); st = 2; } break; case 2: // the second '[' of the CDATA open if (ch != '[') { panic(FAULT); } mBuffIdx = -1; st = 3; break; case 3: // read data before the first ']' if (ch != ']') { bappend(ch); } else { st = 4; } break; case 4: // read the second ']' or continue to read the data if (ch != ']') { bappend(']'); bappend(ch); st = 3; } else { st = 5; } break; case 5: // read '>' or continue to read the data switch (ch) { case ']': bappend(']'); break; case '>': bflash(); st = -1; break; default: bappend(']'); bappend(']'); bappend(ch); st = 3; break; } break; default: panic(FAULT); } } } /** * Reads a xml name. * * The xml name must conform "Namespaces in XML" specification. Therefore * the ':' character is not allowed in the name. This method should be used * for PI and entity names which may not have a namespace according to the * specification mentioned above. * * @param ns The true value turns namespace conformance on. * @return The name has been read. * @exception Exception When incorrect character appear in the name. * @exception IOException */ protected String name(boolean ns) throws Exception { mBuffIdx = -1; bname(ns); return new String(mBuff, 1, mBuffIdx); } /** * Reads a qualified xml name. * * The characters of a qualified name is an array of characters. The first * (chars[0]) character is the index of the colon character which separates * the prefix from the local name. If the index is zero, the name does not * contain separator or the parser works in the namespace unaware mode. The * length of qualified name is the length of the array minus one. * * @param ns The true value turns namespace conformance on. * @return The characters of a qualified name. * @exception Exception When incorrect character appear in the name. * @exception IOException */ protected char[] qname(boolean ns) throws Exception { mBuffIdx = -1; bname(ns); char chars[] = new char[mBuffIdx + 1]; System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); return chars; } /** * Reads the public or/and system identifiers. * * @param inp The input object. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void pubsys(Input inp) throws Exception { Pair pair = pubsys(' '); inp.pubid = pair.name; inp.sysid = pair.value; del(pair); } /** * Reads the public or/and system identifiers. * * @param flag The 'N' allows public id be without system id. * @return The public or/and system identifiers pair. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private Pair pubsys(char flag) throws Exception { Pair ids = pair(null); String str = name(false); if ("PUBLIC".equals(str) == true) { bqstr('i'); // non-CDATA normalization [#4.2.2] ids.name = new String(mBuff, 1, mBuffIdx); switch (wsskip()) { case '\"': case '\'': bqstr(' '); ids.value = new String(mBuff, 1, mBuffIdx); break; case EOS: panic(FAULT); default: if (flag != 'N') // [#4.7] { panic(FAULT); } ids.value = null; break; } return ids; } else if ("SYSTEM".equals(str) == true) { ids.name = null; bqstr(' '); ids.value = new String(mBuff, 1, mBuffIdx); return ids; } panic(FAULT); return null; } /** * Reads an attribute value. * * The grammar this method can read is: *
{@code
     * eqstr := S "=" qstr
     * qstr  := S ("'" string "'") | ('"' string '"')
     * }
* This method resolves entities * inside a string unless the parser parses DTD. * * @param flag The '=' character forces the method to accept the '=' * character before quoted string and read the following string as not an * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; * '-' - not an attribute value; 'd' - in DTD context. * @return The content of the quoted strign as a string. * @exception Exception is parser specific exception form panic method. * @exception IOException */ protected String eqstr(char flag) throws Exception { if (flag == '=') { wsskip(); if (getch() != '=') { panic(FAULT); } } bqstr((flag == '=') ? '-' : flag); return new String(mBuff, 1, mBuffIdx); } /** * Resoves an entity. * * This method resolves built-in and character entity references. It is also * reports external entities to the application. * * @param flag The 'x' character forces the method to report a skipped * entity; 'i' character - indicates non-CDATA normalization. * @return Name of unresolved entity or null if entity had been * resolved successfully. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private String ent(char flag) throws Exception { char ch; int idx = mBuffIdx + 1; Input inp = null; String str = null; mESt = 0x100; // reset the built-in entity recognizer bappend('&'); for (short st = 0; st >= 0;) { ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); switch (st) { case 0: // the first character of the entity name case 1: // read built-in entity name switch (chtyp(ch)) { case 'd': case '.': case '-': if (st != 1) { panic(FAULT); } case 'a': case 'A': case '_': case 'X': bappend(ch); eappend(ch); st = 1; break; case ':': if (mIsNSAware != false) { panic(FAULT); } bappend(ch); eappend(ch); st = 1; break; case ';': if (mESt < 0x100) { // The entity is a built-in entity mBuffIdx = idx - 1; bappend(mESt); st = -1; break; } else if (mPh == PH_DTD) { // In DTD entity declaration has to resolve character // entities and include "as is" others. [#4.4.7] bappend(';'); st = -1; break; } // Convert an entity name to a string str = new String(mBuff, idx + 1, mBuffIdx - idx); inp = mEnt.get(str); // Restore the buffer offset mBuffIdx = idx - 1; if (inp != null) { if (inp.chars == null) { // External entity InputSource is = resolveEnt(str, inp.pubid, inp.sysid); if (is != null) { push(new Input(BUFFSIZE_READER)); setinp(is); mInp.pubid = inp.pubid; mInp.sysid = inp.sysid; str = null; // the entity is resolved } else { // Unresolved external entity if (flag != 'x') { panic(FAULT); // unknown entity within marckup } // str is name of unresolved entity } } else { // Internal entity push(inp); str = null; // the entity is resolved } } else { // Unknown or general unparsed entity if (flag != 'x') { panic(FAULT); // unknown entity within marckup } // str is name of unresolved entity } st = -1; break; case '#': if (st != 0) { panic(FAULT); } st = 2; break; default: panic(FAULT); } break; case 2: // read character entity switch (chtyp(ch)) { case 'd': bappend(ch); break; case ';': // Convert the character entity to a character try { int i = Integer.parseInt( new String(mBuff, idx + 1, mBuffIdx - idx), 10); if (i >= 0xffff) { panic(FAULT); } ch = (char) i; } catch (NumberFormatException nfe) { panic(FAULT); } // Restore the buffer offset mBuffIdx = idx - 1; if (ch == ' ' || mInp.next != null) { bappend(ch, flag); } else { bappend(ch); } st = -1; break; case 'a': // If the entity buffer is empty and ch == 'x' if ((mBuffIdx == idx) && (ch == 'x')) { st = 3; break; } default: panic(FAULT); } break; case 3: // read hex character entity switch (chtyp(ch)) { case 'A': case 'a': case 'd': bappend(ch); break; case ';': // Convert the character entity to a character try { int i = Integer.parseInt( new String(mBuff, idx + 1, mBuffIdx - idx), 16); if (i >= 0xffff) { panic(FAULT); } ch = (char) i; } catch (NumberFormatException nfe) { panic(FAULT); } // Restore the buffer offset mBuffIdx = idx - 1; if (ch == ' ' || mInp.next != null) { bappend(ch, flag); } else { bappend(ch); } st = -1; break; default: panic(FAULT); } break; default: panic(FAULT); } } return str; } /** * Resoves a parameter entity. * * This method resolves a parameter entity references. It is also reports * external entities to the application. * * @param flag The '-' instruct the method to do not set up surrounding * spaces [#4.4.8]. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void pent(char flag) throws Exception { char ch; int idx = mBuffIdx + 1; Input inp = null; String str = null; bappend('%'); if (mPh != PH_DTD) // the DTD internal subset { return; // Not Recognized [#4.4.1] } // Read entity name bname(false); str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); if (getch() != ';') { panic(FAULT); } inp = mPEnt.get(str); // Restore the buffer offset mBuffIdx = idx - 1; if (inp != null) { if (inp.chars == null) { // External parameter entity InputSource is = resolveEnt(str, inp.pubid, inp.sysid); if (is != null) { if (flag != '-') { bappend(' '); // tail space } push(new Input(BUFFSIZE_READER)); // BUG: there is no leading space! [#4.4.8] setinp(is); mInp.pubid = inp.pubid; mInp.sysid = inp.sysid; } else { // Unresolved external parameter entity skippedEnt("%" + str); } } else { // Internal parameter entity if (flag == '-') { // No surrounding spaces inp.chIdx = 1; } else { // Insert surrounding spaces bappend(' '); // tail space inp.chIdx = 0; } push(inp); } } else { // Unknown parameter entity skippedEnt("%" + str); } } /** * Recognizes and handles a namespace declaration. * * This method identifies a type of namespace declaration if any and puts * new mapping on top of prefix stack. * * @param name The attribute qualified name (name.value is a * String object which represents the attribute prefix). * @param value The attribute value. * @return true if a namespace declaration is recognized. */ private boolean isdecl(Pair name, String value) { if (name.chars[0] == 0) { if ("xmlns".equals(name.name) == true) { // New default namespace declaration mPref = pair(mPref); mPref.list = mElm; // prefix owner element mPref.value = value; mPref.name = ""; mPref.chars = NONS; mElm.num++; // namespace counter return true; } } else { if (name.eqpref(XMLNS) == true) { // New prefix declaration int len = name.name.length(); mPref = pair(mPref); mPref.list = mElm; // prefix owner element mPref.value = value; mPref.name = name.name; mPref.chars = new char[len + 1]; mPref.chars[0] = (char) (len + 1); name.name.getChars(0, len, mPref.chars, 1); mElm.num++; // namespace counter return true; } } return false; } /** * Resolves a prefix. * * @return The namespace assigned to the prefix. * @exception Exception When mapping for specified prefix is not found. */ private String rslv(char[] qname) throws Exception { for (Pair pref = mPref; pref != null; pref = pref.next) { if (pref.eqpref(qname) == true) { return pref.value; } } if (qname[0] == 1) { // QNames like ':local' for (Pair pref = mPref; pref != null; pref = pref.next) { if (pref.chars[0] == 0) { return pref.value; } } } panic(FAULT); return null; } /** * Skips xml white space characters. * * This method skips white space characters (' ', '\t', '\n', '\r') and * looks ahead not white space character. * * @return The first not white space look ahead character. * @exception IOException */ protected char wsskip() throws IOException { char ch; while (true) { // Read next character ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); if (ch < 0x80) { if (nmttyp[ch] != 3) // [ \t\n\r] { break; } } else { break; } } mChIdx--; // bkch(); return ch; } /** * Reports document type. * * @param name The name of the entity. * @param pubid The public identifier of the entity or null. * @param sysid The system identifier of the entity or null. */ protected abstract void docType(String name, String pubid, String sysid) throws SAXException; /** * Reports the start of DTD internal subset. * * @throws SAXException if the receiver throws SAXException */ public abstract void startInternalSub () throws SAXException; /** * Reports a comment. * * @param text The comment text starting from first charcater. * @param length The number of characters in comment. */ protected abstract void comm(char[] text, int length); /** * Reports a processing instruction. * * @param target The processing instruction target name. * @param body The processing instruction body text. */ protected abstract void pi(String target, String body) throws Exception; /** * Reports new namespace prefix. The Namespace prefix ( * mPref.name) being declared and the Namespace URI ( * mPref.value) the prefix is mapped to. An empty string is * used for the default element namespace, which has no prefix. */ protected abstract void newPrefix() throws Exception; /** * Reports skipped entity name. * * @param name The entity name. */ protected abstract void skippedEnt(String name) throws Exception; /** * Returns an * InputSource for specified entity or * null. * * @param name The name of the entity. * @param pubid The public identifier of the entity. * @param sysid The system identifier of the entity. */ protected abstract InputSource resolveEnt( String name, String pubid, String sysid) throws Exception; /** * Reports notation declaration. * * @param name The notation's name. * @param pubid The notation's public identifier, or null if none was given. * @param sysid The notation's system identifier, or null if none was given. */ protected abstract void notDecl(String name, String pubid, String sysid) throws Exception; /** * Reports unparsed entity name. * * @param name The unparsed entity's name. * @param pubid The entity's public identifier, or null if none was given. * @param sysid The entity's system identifier. * @param notation The name of the associated notation. */ protected abstract void unparsedEntDecl( String name, String pubid, String sysid, String notation) throws Exception; /** * Notifies the handler about fatal parsing error. * * @param msg The problem description message. */ protected abstract void panic(String msg) throws Exception; /** * Reads a qualified xml name. * * This is low level routine which leaves a qName in the buffer. The * characters of a qualified name is an array of characters. The first * (chars[0]) character is the index of the colon character which separates * the prefix from the local name. If the index is zero, the name does not * contain separator or the parser works in the namespace unaware mode. The * length of qualified name is the length of the array minus one. * * @param ns The true value turns namespace conformance on. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void bname(boolean ns) throws Exception { char ch; char type; mBuffIdx++; // allocate a char for colon offset int bqname = mBuffIdx; int bcolon = bqname; int bchidx = bqname + 1; int bstart = bchidx; int cstart = mChIdx; short st = (short) ((ns == true) ? 0 : 2); while (true) { // Read next character if (mChIdx >= mChLen) { bcopy(cstart, bstart); getch(); mChIdx--; // bkch(); cstart = mChIdx; bstart = bchidx; } ch = mChars[mChIdx++]; type = (char) 0; // [X] if (ch < 0x80) { type = (char) nmttyp[ch]; } else if (ch == EOS) { panic(FAULT); } // Parse QName switch (st) { case 0: // read the first char of the prefix case 2: // read the first char of the suffix switch (type) { case 0: // [aA_X] bchidx++; // append char to the buffer st++; // (st == 0)? 1: 3; break; case 1: // [:] mChIdx--; // bkch(); st++; // (st == 0)? 1: 3; break; default: panic(FAULT); } break; case 1: // read the prefix case 3: // read the suffix switch (type) { case 0: // [aA_X] case 2: // [.-d] bchidx++; // append char to the buffer break; case 1: // [:] bchidx++; // append char to the buffer if (ns == true) { if (bcolon != bqname) { panic(FAULT); // it must be only one colon } bcolon = bchidx - 1; if (st == 1) { st = 2; } } break; default: mChIdx--; // bkch(); bcopy(cstart, bstart); mBuff[bqname] = (char) (bcolon - bqname); return; } break; default: panic(FAULT); } } } /** * Reads a nmtoken. * * This is low level routine which leaves a nmtoken in the buffer. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void bntok() throws Exception { char ch; mBuffIdx = -1; bappend((char) 0); // default offset to the colon char while (true) { ch = getch(); switch (chtyp(ch)) { case 'a': case 'A': case 'd': case '.': case ':': case '-': case '_': case 'X': bappend(ch); break; case 'Z': panic(FAULT); default: bkch(); return; } } } /** * Recognizes a keyword. * * This is low level routine which recognizes one of keywords in the buffer. * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - * Q IMPLIED - I FIXED - F * * @return an id of a keyword or '?'. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private char bkeyword() throws Exception { String str = new String(mBuff, 1, mBuffIdx); switch (str.length()) { case 2: // ID return ("ID".equals(str) == true) ? 'i' : '?'; case 5: // IDREF, CDATA, FIXED switch (mBuff[1]) { case 'I': return ("IDREF".equals(str) == true) ? 'r' : '?'; case 'C': return ("CDATA".equals(str) == true) ? 'c' : '?'; case 'F': return ("FIXED".equals(str) == true) ? 'F' : '?'; default: break; } break; case 6: // IDREFS, ENTITY switch (mBuff[1]) { case 'I': return ("IDREFS".equals(str) == true) ? 'R' : '?'; case 'E': return ("ENTITY".equals(str) == true) ? 'n' : '?'; default: break; } break; case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT switch (mBuff[1]) { case 'I': return ("IMPLIED".equals(str) == true) ? 'I' : '?'; case 'N': return ("NMTOKEN".equals(str) == true) ? 't' : '?'; case 'A': return ("ATTLIST".equals(str) == true) ? 'a' : '?'; case 'E': return ("ELEMENT".equals(str) == true) ? 'e' : '?'; default: break; } break; case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED switch (mBuff[2]) { case 'N': return ("ENTITIES".equals(str) == true) ? 'N' : '?'; case 'M': return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; case 'O': return ("NOTATION".equals(str) == true) ? 'o' : '?'; case 'E': return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; default: break; } break; default: break; } return '?'; } /** * Reads a single or double quotted string in to the buffer. * * This method resolves entities inside a string unless the parser parses * DTD. * * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - * not an attribute value; 'd' - in DTD context. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void bqstr(char flag) throws Exception { Input inp = mInp; // remember the original input mBuffIdx = -1; bappend((char) 0); // default offset to the colon char char ch; for (short st = 0; st >= 0;) { ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); switch (st) { case 0: // read a single or double quote switch (ch) { case ' ': case '\n': case '\r': case '\t': break; case '\'': st = 2; // read a single quoted string break; case '\"': st = 3; // read a double quoted string break; default: panic(FAULT); break; } break; case 2: // read a single quoted string case 3: // read a double quoted string switch (ch) { case '\'': if ((st == 2) && (mInp == inp)) { st = -1; } else { bappend(ch); } break; case '\"': if ((st == 3) && (mInp == inp)) { st = -1; } else { bappend(ch); } break; case '&': if (flag != 'd') { ent(flag); } else { bappend(ch); } break; case '%': if (flag == 'd') { pent('-'); } else { bappend(ch); } break; case '<': if ((flag == '-') || (flag == 'd')) { bappend(ch); } else { panic(FAULT); } break; case EOS: // EOS before single/double quote panic(FAULT); case '\r': // EOL processing [#2.11 & #3.3.3] if (flag != ' ' && mInp.next == null) { if (getch() != '\n') { bkch(); } ch = '\n'; } default: bappend(ch, flag); break; } break; default: panic(FAULT); } } // There is maximum one space at the end of the string in // i-mode (non CDATA normalization) and it has to be removed. if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { mBuffIdx -= 1; } } /** * Reports characters and empties the parser's buffer. This method is called * only if parser is going to return control to the main loop. This means * that this method may use parser buffer to report white space without * copying characters to temporary buffer. */ protected abstract void bflash() throws Exception; /** * Reports white space characters and empties the parser's buffer. This * method is called only if parser is going to return control to the main * loop. This means that this method may use parser buffer to report white * space without copying characters to temporary buffer. */ protected abstract void bflash_ws() throws Exception; /** * Appends a character to parser's buffer with normalization. * * @param ch The character to append to the buffer. * @param mode The normalization mode. */ private void bappend(char ch, char mode) { // This implements attribute value normalization as // described in the XML specification [#3.3.3]. switch (mode) { case 'i': // non CDATA normalization switch (ch) { case ' ': case '\n': case '\r': case '\t': if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { bappend(' '); } return; default: break; } break; case 'c': // CDATA normalization switch (ch) { case '\n': case '\r': case '\t': ch = ' '; break; default: break; } break; default: // no normalization break; } mBuffIdx++; if (mBuffIdx < mBuff.length) { mBuff[mBuffIdx] = ch; } else { mBuffIdx--; bappend(ch); } } /** * Appends a character to parser's buffer. * * @param ch The character to append to the buffer. */ private void bappend(char ch) { try { mBuff[++mBuffIdx] = ch; } catch (Exception exp) { // Double the buffer size char buff[] = new char[mBuff.length << 1]; System.arraycopy(mBuff, 0, buff, 0, mBuff.length); mBuff = buff; mBuff[mBuffIdx] = ch; } } /** * Appends (mChIdx - cidx) characters from character buffer (mChars) to * parser's buffer (mBuff). * * @param cidx The character buffer (mChars) start index. * @param bidx The parser buffer (mBuff) start index. */ private void bcopy(int cidx, int bidx) { int length = mChIdx - cidx; if ((bidx + length + 1) >= mBuff.length) { // Expand the buffer char buff[] = new char[mBuff.length + length]; System.arraycopy(mBuff, 0, buff, 0, mBuff.length); mBuff = buff; } System.arraycopy(mChars, cidx, mBuff, bidx, length); mBuffIdx += length; } /** * Recognizes the built-in entities lt, gt, amp, * apos, quot. The initial state is 0x100. Any state belowe * 0x100 is a built-in entity replacement character. * * @param ch the next character of an entity name. */ @SuppressWarnings("fallthrough") private void eappend(char ch) { switch (mESt) { case 0x100: // "l" or "g" or "a" or "q" switch (ch) { case 'l': mESt = 0x101; break; case 'g': mESt = 0x102; break; case 'a': mESt = 0x103; break; case 'q': mESt = 0x107; break; default: mESt = 0x200; break; } break; case 0x101: // "lt" mESt = (ch == 't') ? '<' : (char) 0x200; break; case 0x102: // "gt" mESt = (ch == 't') ? '>' : (char) 0x200; break; case 0x103: // "am" or "ap" switch (ch) { case 'm': mESt = 0x104; break; case 'p': mESt = 0x105; break; default: mESt = 0x200; break; } break; case 0x104: // "amp" mESt = (ch == 'p') ? '&' : (char) 0x200; break; case 0x105: // "apo" mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; break; case 0x106: // "apos" mESt = (ch == 's') ? '\'' : (char) 0x200; break; case 0x107: // "qu" mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; break; case 0x108: // "quo" mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; break; case 0x109: // "quot" mESt = (ch == 't') ? '\"' : (char) 0x200; break; case '<': // "lt" case '>': // "gt" case '&': // "amp" case '\'': // "apos" case '\"': // "quot" mESt = 0x200; default: break; } } /** * Sets up a new input source on the top of the input stack. Note, the first * byte returned by the entity's byte stream has to be the first byte in the * entity. However, the parser does not expect the byte order mask in both * cases when encoding is provided by the input source. * * @param is A new input source to set up. * @exception IOException If any IO errors occur. * @exception Exception is parser specific exception form panic method. */ protected void setinp(InputSource is) throws Exception { Reader reader = null; mChIdx = 0; mChLen = 0; mChars = mInp.chars; mInp.src = null; if (mPh < PH_DOC_START) { mIsSAlone = false; // default [#2.9] } mIsSAloneSet = false; if (is.getCharacterStream() != null) { // Ignore encoding in the xml text decl. reader = is.getCharacterStream(); xml(reader); } else if (is.getByteStream() != null) { String expenc; if (is.getEncoding() != null) { // Ignore encoding in the xml text decl. expenc = is.getEncoding().toUpperCase(); if (expenc.equals("UTF-16")) { reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] } else { reader = enc(expenc, is.getByteStream()); } xml(reader); } else { // Get encoding from BOM or the xml text decl. reader = bom(is.getByteStream(), ' '); /** * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon * that it may be missing. A mature technique exists in Xerces * to further check for possible UTF-16 encoding */ if (reader == null) { reader = utf16(is.getByteStream()); } if (reader == null) { // Encoding is defined by the xml text decl. reader = enc("UTF-8", is.getByteStream()); expenc = xml(reader); if (!expenc.equals("UTF-8")) { if (expenc.startsWith("UTF-16")) { panic(FAULT); // UTF-16 must have BOM [#4.3.3] } reader = enc(expenc, is.getByteStream()); } } else { // Encoding is defined by the BOM. xml(reader); } } } else { // There is no support for public/system identifiers. panic(FAULT); } mInp.src = reader; mInp.pubid = is.getPublicId(); mInp.sysid = is.getSystemId(); } /** * Determines the entity encoding. * * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the * first byte returned by the entity's byte stream has to be the first byte * in the entity. Also, there is no support for UCS-4. * * @param is A byte stream of the entity. * @param hint An encoding hint, character U means UTF-16. * @return a reader constructed from the BOM or UTF-8 by default. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private Reader bom(InputStream is, char hint) throws Exception { int val = is.read(); switch (val) { case 0xef: // UTF-8 if (hint == 'U') // must be UTF-16 { panic(FAULT); } if (is.read() != 0xbb) { panic(FAULT); } if (is.read() != 0xbf) { panic(FAULT); } return new ReaderUTF8(is); case 0xfe: // UTF-16, big-endian if (is.read() != 0xff) { panic(FAULT); } return new ReaderUTF16(is, 'b'); case 0xff: // UTF-16, little-endian if (is.read() != 0xfe) { panic(FAULT); } return new ReaderUTF16(is, 'l'); case -1: mChars[mChIdx++] = EOS; return new ReaderUTF8(is); default: if (hint == 'U') // must be UTF-16 { panic(FAULT); } // Read the rest of UTF-8 character switch (val & 0xf0) { case 0xc0: case 0xd0: mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); break; case 0xe0: mChars[mChIdx++] = (char) (((val & 0x0f) << 12) | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); break; case 0xf0: // UCS-4 character throw new UnsupportedEncodingException(); default: mChars[mChIdx++] = (char) val; break; } return null; } } /** * Using a mature technique from Xerces, this method checks further after * the bom method above to see if the encoding is UTF-16 * * @param is A byte stream of the entity. * @return a reader, may be null * @exception Exception is parser specific exception form panic method. * @exception IOException */ private Reader utf16(InputStream is) throws Exception { if (mChIdx != 0) { //The bom method has read ONE byte into the buffer. byte b0 = (byte)mChars[0]; if (b0 == 0x00 || b0 == 0x3C) { int b1 = is.read(); int b2 = is.read(); int b3 = is.read(); if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { // UTF-16, big-endian, no BOM mChars[0] = (char)(b1); mChars[mChIdx++] = (char)(b3); return new ReaderUTF16(is, 'b'); } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { // UTF-16, little-endian, no BOM mChars[0] = (char)(b0); mChars[mChIdx++] = (char)(b2); return new ReaderUTF16(is, 'l'); } else { /**not every InputStream supports reset, so we have to remember * the state for further parsing **/ mChars[0] = (char)(b0); mChars[mChIdx++] = (char)(b1); mChars[mChIdx++] = (char)(b2); mChars[mChIdx++] = (char)(b3); } } } return null; } /** * Parses the xml text declaration. * * This method gets encoding from the xml text declaration [#4.3.1] if any. * The method assumes the buffer (mChars) is big enough to accommodate whole * xml text declaration. * * @param reader is entity reader. * @return The xml text declaration encoding or default UTF-8 encoding. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private String xml(Reader reader) throws Exception { String str = null; String enc = "UTF-8"; char ch; int val; short st = 0; int byteRead = mChIdx; //number of bytes read prior to entering this method while (st >= 0 && mChIdx < mChars.length) { if (st < byteRead) { ch = mChars[st]; } else { ch = ((val = reader.read()) >= 0) ? (char) val : EOS; mChars[mChIdx++] = ch; } switch (st) { case 0: // read '<' of xml declaration switch (ch) { case '<': st = 1; break; case 0xfeff: // the byte order mask ch = ((val = reader.read()) >= 0) ? (char) val : EOS; mChars[mChIdx - 1] = ch; st = (short) ((ch == '<') ? 1 : -1); break; default: st = -1; break; } break; case 1: // read '?' of xml declaration [#4.3.1] st = (short) ((ch == '?') ? 2 : -1); break; case 2: // read 'x' of xml declaration [#4.3.1] st = (short) ((ch == 'x') ? 3 : -1); break; case 3: // read 'm' of xml declaration [#4.3.1] st = (short) ((ch == 'm') ? 4 : -1); break; case 4: // read 'l' of xml declaration [#4.3.1] st = (short) ((ch == 'l') ? 5 : -1); break; case 5: // read white space after 'xml' switch (ch) { case ' ': case '\t': case '\r': case '\n': st = 6; break; default: st = -1; break; } break; case 6: // read content of xml declaration switch (ch) { case '?': st = 7; break; case EOS: st = -2; break; default: break; } break; case 7: // read '>' after '?' of xml declaration switch (ch) { case '>': case EOS: st = -2; break; default: st = 6; break; } break; default: panic(FAULT); break; } } mChLen = mChIdx; mChIdx = 0; // If there is no xml text declaration, the encoding is default. if (st == -1) { return enc; } mChIdx = 5; // the first white space after "= 0;) { ch = getch(); switch (st) { case 0: // skip spaces after the xml declaration name if (chtyp(ch) != ' ') { bkch(); st = 1; } break; case 1: // read xml declaration version case 2: // read xml declaration encoding or standalone case 3: // read xml declaration standalone switch (chtyp(ch)) { case 'a': case 'A': case '_': bkch(); str = name(false).toLowerCase(); if ("version".equals(str) == true) { if (st != 1) { panic(FAULT); } if ("1.0".equals(eqstr('=')) != true) { panic(FAULT); } mInp.xmlver = 0x0100; st = 2; } else if ("encoding".equals(str) == true) { if (st != 2) { panic(FAULT); } mInp.xmlenc = eqstr('=').toUpperCase(); enc = mInp.xmlenc; st = 3; } else if ("standalone".equals(str) == true) { if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] { panic(FAULT); } str = eqstr('=').toLowerCase(); // Check the 'standalone' value and use it [#5.1] if (str.equals("yes") == true) { mIsSAlone = true; } else if (str.equals("no") == true) { mIsSAlone = false; } else { panic(FAULT); } mIsSAloneSet = true; st = 4; } else { panic(FAULT); } break; case ' ': break; case '?': if (st == 1) { panic(FAULT); } bkch(); st = 4; break; default: panic(FAULT); } break; case 4: // end of xml declaration switch (chtyp(ch)) { case '?': if (getch() != '>') { panic(FAULT); } if (mPh <= PH_DOC_START) { mPh = PH_MISC_DTD; // misc before DTD } st = -1; break; case ' ': break; default: panic(FAULT); } break; default: panic(FAULT); } } return enc; } /** * Sets up the document reader. * * @param name an encoding name. * @param is the document byte input stream. * @return a reader constructed from encoding name and input stream. * @exception UnsupportedEncodingException */ private Reader enc(String name, InputStream is) throws UnsupportedEncodingException { // DO NOT CLOSE current reader if any! if (name.equals("UTF-8")) { return new ReaderUTF8(is); } else if (name.equals("UTF-16LE")) { return new ReaderUTF16(is, 'l'); } else if (name.equals("UTF-16BE")) { return new ReaderUTF16(is, 'b'); } else { return new InputStreamReader(is, name); } } /** * Sets up current input on the top of the input stack. * * @param inp A new input to set up. */ protected void push(Input inp) { mInp.chLen = mChLen; mInp.chIdx = mChIdx; inp.next = mInp; mInp = inp; mChars = inp.chars; mChLen = inp.chLen; mChIdx = inp.chIdx; } /** * Restores previous input on the top of the input stack. */ protected void pop() { if (mInp.src != null) { try { mInp.src.close(); } catch (IOException ioe) { } mInp.src = null; } mInp = mInp.next; if (mInp != null) { mChars = mInp.chars; mChLen = mInp.chLen; mChIdx = mInp.chIdx; } else { mChars = null; mChLen = 0; mChIdx = 0; } } /** * Maps a character to its type. * * Possible character type values are: * * An ASCII (7 bit) character which does not fall in any category * listed above is mapped to itself. * * @param ch The character to map. * @return The type of character. */ protected char chtyp(char ch) { if (ch < 0x80) { return (char) asctyp[ch]; } return (ch != EOS) ? 'X' : 'Z'; } /** * Retrives the next character in the document. * * @return The next character in the document. */ protected char getch() throws IOException { if (mChIdx >= mChLen) { if (mInp.src == null) { pop(); // remove internal entity return getch(); } // Read new portion of the document characters int Num = mInp.src.read(mChars, 0, mChars.length); if (Num < 0) { if (mInp != mDoc) { pop(); // restore the previous input return getch(); } else { mChars[0] = EOS; mChLen = 1; } } else { mChLen = Num; } mChIdx = 0; } return mChars[mChIdx++]; } /** * Puts back the last read character. * * This method MUST NOT be called more then once after each * call of {@link #getch getch} method. */ protected void bkch() throws Exception { if (mChIdx <= 0) { panic(FAULT); } mChIdx--; } /** * Sets the current character. * * @param ch The character to set. */ protected void setch(char ch) { mChars[mChIdx] = ch; } /** * Finds a pair in the pair chain by a qualified name. * * @param chain The first element of the chain of pairs. * @param qname The qualified name. * @return A pair with the specified qualified name or null. */ protected Pair find(Pair chain, char[] qname) { for (Pair pair = chain; pair != null; pair = pair.next) { if (pair.eqname(qname) == true) { return pair; } } return null; } /** * Provedes an instance of a pair. * * @param next The reference to a next pair. * @return An instance of a pair. */ protected Pair pair(Pair next) { Pair pair; if (mDltd != null) { pair = mDltd; mDltd = pair.next; } else { pair = new Pair(); } pair.next = next; return pair; } /** * Deletes an instance of a pair. * * @param pair The pair to delete. * @return A reference to the next pair in a chain. */ protected Pair del(Pair pair) { Pair next = pair.next; pair.name = null; pair.value = null; pair.chars = null; pair.list = null; pair.next = mDltd; mDltd = pair; return next; } }