1 /*
   2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.activation.registries;
  27 
  28 /**
  29  *      A tokenizer for strings in the form of "foo/bar; prop1=val1; ... ".
  30  *      Useful for parsing MIME content types.
  31  */
  32 public class MailcapTokenizer {
  33 
  34     public static final int UNKNOWN_TOKEN = 0;
  35     public static final int START_TOKEN = 1;
  36     public static final int STRING_TOKEN = 2;
  37     public static final int EOI_TOKEN = 5;
  38     public static final int SLASH_TOKEN = '/';
  39     public static final int SEMICOLON_TOKEN = ';';
  40     public static final int EQUALS_TOKEN = '=';
  41 
  42     /**
  43      *  Constructor
  44      *
  45      *  @parameter  inputString the string to tokenize
  46      */
  47     public MailcapTokenizer(String inputString) {
  48         data = inputString;
  49         dataIndex = 0;
  50         dataLength = inputString.length();
  51 
  52         currentToken = START_TOKEN;
  53         currentTokenValue = "";
  54 
  55         isAutoquoting = false;
  56         autoquoteChar = ';';
  57     }
  58 
  59     /**
  60      *  Set whether auto-quoting is on or off.
  61      *
  62      *  Auto-quoting means that all characters after the first
  63      *  non-whitespace, non-control character up to the auto-quote
  64      *  terminator character or EOI (minus any whitespace immediatley
  65      *  preceeding it) is considered a token.
  66      *
  67      *  This is required for handling command strings in a mailcap entry.
  68      */
  69     public void setIsAutoquoting(boolean value) {
  70         isAutoquoting = value;
  71     }
  72 
  73     /**
  74      *  Retrieve current token.
  75      *
  76      *  @returns    The current token value
  77      */
  78     public int getCurrentToken() {
  79         return currentToken;
  80     }
  81 
  82     /*
  83      *  Get a String that describes the given token.
  84      */
  85     public static String nameForToken(int token) {
  86         String name = "really unknown";
  87 
  88         switch(token) {
  89             case UNKNOWN_TOKEN:
  90                 name = "unknown";
  91                 break;
  92             case START_TOKEN:
  93                 name = "start";
  94                 break;
  95             case STRING_TOKEN:
  96                 name = "string";
  97                 break;
  98             case EOI_TOKEN:
  99                 name = "EOI";
 100                 break;
 101             case SLASH_TOKEN:
 102                 name = "'/'";
 103                 break;
 104             case SEMICOLON_TOKEN:
 105                 name = "';'";
 106                 break;
 107             case EQUALS_TOKEN:
 108                 name = "'='";
 109                 break;
 110         }
 111 
 112         return name;
 113     }
 114 
 115     /*
 116      *  Retrieve current token value.
 117      *
 118      *  @returns    A String containing the current token value
 119      */
 120     public String getCurrentTokenValue() {
 121         return currentTokenValue;
 122     }
 123     /*
 124      *  Process the next token.
 125      *
 126      *  @returns    the next token
 127      */
 128     public int nextToken() {
 129         if (dataIndex < dataLength) {
 130             //  skip white space
 131             while ((dataIndex < dataLength) &&
 132                     (isWhiteSpaceChar(data.charAt(dataIndex)))) {
 133                 ++dataIndex;
 134             }
 135 
 136             if (dataIndex < dataLength) {
 137                 //  examine the current character and see what kind of token we have
 138                 char c = data.charAt(dataIndex);
 139                 if (isAutoquoting) {
 140                     if (c == ';' || c == '=') {
 141                         currentToken = c;
 142                         currentTokenValue = new Character(c).toString();
 143                         ++dataIndex;
 144                     } else {
 145                         processAutoquoteToken();
 146                     }
 147                 } else {
 148                     if (isStringTokenChar(c)) {
 149                         processStringToken();
 150                     } else if ((c == '/') || (c == ';') || (c == '=')) {
 151                         currentToken = c;
 152                         currentTokenValue = new Character(c).toString();
 153                         ++dataIndex;
 154                     } else {
 155                         currentToken = UNKNOWN_TOKEN;
 156                         currentTokenValue = new Character(c).toString();
 157                         ++dataIndex;
 158                     }
 159                 }
 160             } else {
 161                 currentToken = EOI_TOKEN;
 162                 currentTokenValue = null;
 163             }
 164         } else {
 165             currentToken = EOI_TOKEN;
 166             currentTokenValue = null;
 167         }
 168 
 169         return currentToken;
 170     }
 171 
 172     private void processStringToken() {
 173         //  capture the initial index
 174         int initialIndex = dataIndex;
 175 
 176         //  skip to 1st non string token character
 177         while ((dataIndex < dataLength) &&
 178                 isStringTokenChar(data.charAt(dataIndex))) {
 179             ++dataIndex;
 180         }
 181 
 182         currentToken = STRING_TOKEN;
 183         currentTokenValue = data.substring(initialIndex, dataIndex);
 184     }
 185 
 186     private void processAutoquoteToken() {
 187         //  capture the initial index
 188         int initialIndex = dataIndex;
 189 
 190         //  now skip to the 1st non-escaped autoquote termination character
 191         //  XXX - doesn't actually consider escaping
 192         boolean foundTerminator = false;
 193         while ((dataIndex < dataLength) && !foundTerminator) {
 194             char c = data.charAt(dataIndex);
 195             if (c != autoquoteChar) {
 196                 ++dataIndex;
 197             } else {
 198                 foundTerminator = true;
 199             }
 200         }
 201 
 202         currentToken = STRING_TOKEN;
 203         currentTokenValue =
 204             fixEscapeSequences(data.substring(initialIndex, dataIndex));
 205     }
 206 
 207     private static boolean isSpecialChar(char c) {
 208         boolean lAnswer = false;
 209 
 210         switch(c) {
 211             case '(':
 212             case ')':
 213             case '<':
 214             case '>':
 215             case '@':
 216             case ',':
 217             case ';':
 218             case ':':
 219             case '\\':
 220             case '"':
 221             case '/':
 222             case '[':
 223             case ']':
 224             case '?':
 225             case '=':
 226                 lAnswer = true;
 227                 break;
 228         }
 229 
 230         return lAnswer;
 231     }
 232 
 233     private static boolean isControlChar(char c) {
 234         return Character.isISOControl(c);
 235     }
 236 
 237     private static boolean isWhiteSpaceChar(char c) {
 238         return Character.isWhitespace(c);
 239     }
 240 
 241     private static boolean isStringTokenChar(char c) {
 242         return !isSpecialChar(c) && !isControlChar(c) && !isWhiteSpaceChar(c);
 243     }
 244 
 245     private static String fixEscapeSequences(String inputString) {
 246         int inputLength = inputString.length();
 247         StringBuffer buffer = new StringBuffer();
 248         buffer.ensureCapacity(inputLength);
 249 
 250         for (int i = 0; i < inputLength; ++i) {
 251             char currentChar = inputString.charAt(i);
 252             if (currentChar != '\\') {
 253                 buffer.append(currentChar);
 254             } else {
 255                 if (i < inputLength - 1) {
 256                     char nextChar = inputString.charAt(i + 1);
 257                     buffer.append(nextChar);
 258 
 259                     //  force a skip over the next character too
 260                     ++i;
 261                 } else {
 262                     buffer.append(currentChar);
 263                 }
 264             }
 265         }
 266 
 267         return buffer.toString();
 268     }
 269 
 270     private String  data;
 271     private int     dataIndex;
 272     private int     dataLength;
 273     private int     currentToken;
 274     private String  currentTokenValue;
 275     private boolean isAutoquoting;
 276     private char    autoquoteChar;
 277 
 278     /*
 279     public static void main(String[] args) {
 280         for (int i = 0; i < args.length; ++i) {
 281             MailcapTokenizer tokenizer = new MailcapTokenizer(args[i]);
 282 
 283             System.out.println("Original: |" + args[i] + "|");
 284 
 285             int currentToken = tokenizer.nextToken();
 286             while (currentToken != EOI_TOKEN) {
 287                 switch(currentToken) {
 288                     case UNKNOWN_TOKEN:
 289                         System.out.println("  Unknown Token:           |" + tokenizer.getCurrentTokenValue() + "|");
 290                         break;
 291                     case START_TOKEN:
 292                         System.out.println("  Start Token:             |" + tokenizer.getCurrentTokenValue() + "|");
 293                         break;
 294                     case STRING_TOKEN:
 295                         System.out.println("  String Token:            |" + tokenizer.getCurrentTokenValue() + "|");
 296                         break;
 297                     case EOI_TOKEN:
 298                         System.out.println("  EOI Token:               |" + tokenizer.getCurrentTokenValue() + "|");
 299                         break;
 300                     case SLASH_TOKEN:
 301                         System.out.println("  Slash Token:             |" + tokenizer.getCurrentTokenValue() + "|");
 302                         break;
 303                     case SEMICOLON_TOKEN:
 304                         System.out.println("  Semicolon Token:         |" + tokenizer.getCurrentTokenValue() + "|");
 305                         break;
 306                     case EQUALS_TOKEN:
 307                         System.out.println("  Equals Token:            |" + tokenizer.getCurrentTokenValue() + "|");
 308                         break;
 309                     default:
 310                         System.out.println("  Really Unknown Token:    |" + tokenizer.getCurrentTokenValue() + "|");
 311                         break;
 312                 }
 313 
 314                 currentToken = tokenizer.nextToken();
 315             }
 316 
 317             System.out.println("");
 318         }
 319     }
 320     */
 321 }