1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2004 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.regexp.internal;
  22 
  23 import java.io.Serializable;
  24 
  25 /**
  26  * A class that holds compiled regular expressions.  This is exposed mainly
  27  * for use by the recompile utility (which helps you produce precompiled
  28  * REProgram objects). You should not otherwise need to work directly with
  29  * this class.
  30 *
  31  * @see RE
  32  * @see RECompiler
  33  *
  34  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
  35  */
  36 public class REProgram implements Serializable
  37 {
  38     static final int OPT_HASBACKREFS = 1;
  39 
  40     char[] instruction;         // The compiled regular expression 'program'
  41     int lenInstruction;         // The amount of the instruction buffer in use
  42     char[] prefix;              // Prefix string optimization
  43     int flags;                  // Optimization flags (REProgram.OPT_*)
  44     int maxParens = -1;
  45 
  46     /**
  47      * Constructs a program object from a character array
  48      * @param instruction Character array with RE opcode instructions in it
  49      */
  50     public REProgram(char[] instruction)
  51     {
  52         this(instruction, instruction.length);
  53     }
  54 
  55     /**
  56      * Constructs a program object from a character array
  57      * @param parens Count of parens in the program
  58      * @param instruction Character array with RE opcode instructions in it
  59      */
  60     public REProgram(int parens, char[] instruction)
  61     {
  62         this(instruction, instruction.length);
  63         this.maxParens = parens;
  64     }
  65 
  66     /**
  67      * Constructs a program object from a character array
  68      * @param instruction Character array with RE opcode instructions in it
  69      * @param lenInstruction Amount of instruction array in use
  70      */
  71     public REProgram(char[] instruction, int lenInstruction)
  72     {
  73         setInstructions(instruction, lenInstruction);
  74     }
  75 
  76     /**
  77      * Returns a copy of the current regular expression program in a character
  78      * array that is exactly the right length to hold the program.  If there is
  79      * no program compiled yet, getInstructions() will return null.
  80      * @return A copy of the current compiled RE program
  81      */
  82     public char[] getInstructions()
  83     {
  84         // Ensure program has been compiled!
  85         if (lenInstruction != 0)
  86         {
  87             // Return copy of program
  88             char[] ret = new char[lenInstruction];
  89             System.arraycopy(instruction, 0, ret, 0, lenInstruction);
  90             return ret;
  91         }
  92         return null;
  93     }
  94 
  95     /**
  96      * Sets a new regular expression program to run.  It is this method which
  97      * performs any special compile-time search optimizations.  Currently only
  98      * two optimizations are in place - one which checks for backreferences
  99      * (so that they can be lazily allocated) and another which attempts to
 100      * find an prefix anchor string so that substantial amounts of input can
 101      * potentially be skipped without running the actual program.
 102      * @param instruction Program instruction buffer
 103      * @param lenInstruction Length of instruction buffer in use
 104      */
 105     public void setInstructions(char[] instruction, int lenInstruction)
 106     {
 107         // Save reference to instruction array
 108         this.instruction = instruction;
 109         this.lenInstruction = lenInstruction;
 110 
 111         // Initialize other program-related variables
 112         flags = 0;
 113         prefix = null;
 114 
 115         // Try various compile-time optimizations if there's a program
 116         if (instruction != null && lenInstruction != 0)
 117         {
 118             // If the first node is a branch
 119             if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH)
 120             {
 121                 // to the end node
 122                 int next = instruction[0 + RE.offsetNext];
 123                 if (instruction[next + RE.offsetOpcode] == RE.OP_END)
 124                 {
 125                     // and the branch starts with an atom
 126                     if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM)
 127                     {
 128                         // then get that atom as an prefix because there's no other choice
 129                         int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata];
 130                         prefix = new char[lenAtom];
 131                         System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom);
 132                     }
 133                 }
 134             }
 135 
 136             BackrefScanLoop:
 137 
 138             // Check for backreferences
 139             for (int i = 0; i < lenInstruction; i += RE.nodeSize)
 140             {
 141                 switch (instruction[i + RE.offsetOpcode])
 142                 {
 143                     case RE.OP_ANYOF:
 144                         i += (instruction[i + RE.offsetOpdata] * 2);
 145                         break;
 146 
 147                     case RE.OP_ATOM:
 148                         i += instruction[i + RE.offsetOpdata];
 149                         break;
 150 
 151                     case RE.OP_BACKREF:
 152                         flags |= OPT_HASBACKREFS;
 153                         break BackrefScanLoop;
 154                 }
 155             }
 156         }
 157     }
 158 }