1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2005 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 /*
  21  * $Id: WriterToUTF8Buffered.java,v 1.2.4.1 2005/09/15 08:15:31 suresh_emailid Exp $
  22  */
  23 package com.sun.org.apache.xml.internal.serializer;
  24 
  25 import java.io.IOException;
  26 import java.io.OutputStream;
  27 import java.io.UnsupportedEncodingException;
  28 import java.io.Writer;
  29 
  30 
  31 /**
  32  * This class writes unicode characters to a byte stream (java.io.OutputStream)
  33  * as quickly as possible. It buffers the output in an internal
  34  * buffer which must be flushed to the OutputStream when done. This flushing
  35  * is done via the close() flush() or flushBuffer() method.
  36  *
  37  * This class is only used internally within Xalan.
  38  *
  39  * @xsl.usage internal
  40  */
  41 final class WriterToUTF8Buffered extends Writer implements WriterChain
  42 {
  43 
  44   /** number of bytes that the byte buffer can hold.
  45    * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
  46    */
  47   private static final int BYTES_MAX=16*1024;
  48   /** number of characters that the character buffer can hold.
  49    * This is 1/3 of the number of bytes because UTF-8 encoding
  50    * can expand one unicode character by up to 3 bytes.
  51    */
  52   private static final int CHARS_MAX=(BYTES_MAX/3);
  53 
  54  // private static final int
  55 
  56   /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
  57   private final OutputStream m_os;
  58 
  59   /**
  60    * The internal buffer where data is stored.
  61    * (sc & sb remove final to compile in JDK 1.1.8)
  62    */
  63   private final byte m_outputBytes[];
  64 
  65   private final char m_inputChars[];
  66 
  67   /**
  68    * The number of valid bytes in the buffer. This value is always
  69    * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
  70    * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
  71    * byte data.
  72    */
  73   private int count;
  74 
  75   /**
  76    * Create an buffered UTF-8 writer.
  77    *
  78    *
  79    * @param   out    the underlying output stream.
  80    *
  81    * @throws UnsupportedEncodingException
  82    */
  83   public WriterToUTF8Buffered(OutputStream out)
  84           throws UnsupportedEncodingException
  85   {
  86       m_os = out;
  87       // get 3 extra bytes to make buffer overflow checking simpler and faster
  88       // we won't have to keep checking for a few extra characters
  89       m_outputBytes = new byte[BYTES_MAX + 3];
  90 
  91       // Big enough to hold the input chars that will be transformed
  92       // into output bytes in m_ouputBytes.
  93       m_inputChars = new char[CHARS_MAX + 2];
  94       count = 0;
  95 
  96 //      the old body of this constructor, before the buffersize was changed to a constant
  97 //      this(out, 8*1024);
  98   }
  99 
 100   /**
 101    * Create an buffered UTF-8 writer to write data to the
 102    * specified underlying output stream with the specified buffer
 103    * size.
 104    *
 105    * @param   out    the underlying output stream.
 106    * @param   size   the buffer size.
 107    * @exception IllegalArgumentException if size <= 0.
 108    */
 109 //  public WriterToUTF8Buffered(final OutputStream out, final int size)
 110 //  {
 111 //
 112 //    m_os = out;
 113 //
 114 //    if (size <= 0)
 115 //    {
 116 //      throw new IllegalArgumentException(
 117 //        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
 118 //    }
 119 //
 120 //    m_outputBytes = new byte[size];
 121 //    count = 0;
 122 //  }
 123 
 124   /**
 125    * Write a single character.  The character to be written is contained in
 126    * the 16 low-order bits of the given integer value; the 16 high-order bits
 127    * are ignored.
 128    *
 129    * <p> Subclasses that intend to support efficient single-character output
 130    * should override this method.
 131    *
 132    * @param c  int specifying a character to be written.
 133    * @exception  IOException  If an I/O error occurs
 134    */
 135   public void write(final int c) throws IOException
 136   {
 137 
 138     /* If we are close to the end of the buffer then flush it.
 139      * Remember the buffer can hold a few more bytes than BYTES_MAX
 140      */
 141     if (count >= BYTES_MAX)
 142         flushBuffer();
 143 
 144     if (c < 0x80)
 145     {
 146        m_outputBytes[count++] = (byte) (c);
 147     }
 148     else if (c < 0x800)
 149     {
 150       m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
 151       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 152     }
 153     else if (c < 0x10000)
 154     {
 155       m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
 156       m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 157       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 158     }
 159         else
 160         {
 161           m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
 162           m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
 163           m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 164           m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 165         }
 166 
 167   }
 168 
 169 
 170   /**
 171    * Write a portion of an array of characters.
 172    *
 173    * @param  chars  Array of characters
 174    * @param  start   Offset from which to start writing characters
 175    * @param  length   Number of characters to write
 176    *
 177    * @exception  IOException  If an I/O error occurs
 178    *
 179    * @throws java.io.IOException
 180    */
 181   public void write(final char chars[], final int start, final int length)
 182           throws java.io.IOException
 183   {
 184 
 185     // We multiply the length by three since this is the maximum length
 186     // of the characters that we can put into the buffer.  It is possible
 187     // for each Unicode character to expand to three bytes.
 188 
 189     int lengthx3 = 3*length;
 190 
 191     if (lengthx3 >= BYTES_MAX - count)
 192     {
 193       // The requested length is greater than the unused part of the buffer
 194       flushBuffer();
 195 
 196       if (lengthx3 > BYTES_MAX)
 197       {
 198         /*
 199          * The requested length exceeds the size of the buffer.
 200          * Cut the buffer up into chunks, each of which will
 201          * not cause an overflow to the output buffer m_outputBytes,
 202          * and make multiple recursive calls.
 203          * Be careful about integer overflows in multiplication.
 204          */
 205         int split = length/CHARS_MAX;
 206         final int chunks;
 207         if (length % CHARS_MAX > 0)
 208             chunks = split + 1;
 209         else
 210             chunks = split;
 211         int end_chunk = start;
 212         for (int chunk = 1; chunk <= chunks; chunk++)
 213         {
 214             int start_chunk = end_chunk;
 215             end_chunk = start + (int) ((((long) length) * chunk) / chunks);
 216 
 217             // Adjust the end of the chunk if it ends on a high char
 218             // of a Unicode surrogate pair and low char of the pair
 219             // is not going to be in the same chunk
 220             final char c = chars[end_chunk - 1];
 221             int ic = chars[end_chunk - 1];
 222             if (c >= 0xD800 && c <= 0xDBFF) {
 223                 // The last Java char that we were going
 224                 // to process is the first of a
 225                 // Java surrogate char pair that
 226                 // represent a Unicode character.
 227 
 228                 if (end_chunk < start + length) {
 229                     // Avoid spanning by including the low
 230                     // char in the current chunk of chars.
 231                     end_chunk++;
 232                 } else {
 233                     /* This is the last char of the last chunk,
 234                      * and it is the high char of a high/low pair with
 235                      * no low char provided.
 236                      * TODO: error message needed.
 237                      * The char array incorrectly ends in a high char
 238                      * of a high/low surrogate pair, but there is
 239                      * no corresponding low as the high is the last char
 240                      */
 241                     end_chunk--;
 242                 }
 243             }
 244 
 245 
 246             int len_chunk = (end_chunk - start_chunk);
 247             this.write(chars,start_chunk, len_chunk);
 248         }
 249         return;
 250       }
 251     }
 252 
 253 
 254 
 255     final int n = length+start;
 256     final byte[] buf_loc = m_outputBytes; // local reference for faster access
 257     int count_loc = count;      // local integer for faster access
 258     int i = start;
 259     {
 260         /* This block could be omitted and the code would produce
 261          * the same result. But this block exists to give the JIT
 262          * a better chance of optimizing a tight and common loop which
 263          * occurs when writing out ASCII characters.
 264          */
 265         char c;
 266         for(; i < n && (c = chars[i])< 0x80 ; i++ )
 267             buf_loc[count_loc++] = (byte)c;
 268     }
 269     for (; i < n; i++)
 270     {
 271 
 272       final char c = chars[i];
 273 
 274       if (c < 0x80)
 275         buf_loc[count_loc++] = (byte) (c);
 276       else if (c < 0x800)
 277       {
 278         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
 279         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 280       }
 281       /**
 282         * The following else if condition is added to support XML 1.1 Characters for
 283         * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
 284         * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
 285         *          [1101 11yy] [yyxx xxxx] (low surrogate)
 286         *          * uuuuu = wwww + 1
 287         */
 288       else if (c >= 0xD800 && c <= 0xDBFF)
 289       {
 290           char high, low;
 291           high = c;
 292           i++;
 293           low = chars[i];
 294 
 295           buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
 296           buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
 297           buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
 298           buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
 299       }
 300       else
 301       {
 302         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
 303         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 304         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 305       }
 306     }
 307     // Store the local integer back into the instance variable
 308     count = count_loc;
 309 
 310   }
 311 
 312   /**
 313    * Write a string.
 314    *
 315    * @param  s  String to be written
 316    *
 317    * @exception  IOException  If an I/O error occurs
 318    */
 319   public void write(final String s) throws IOException
 320   {
 321 
 322     // We multiply the length by three since this is the maximum length
 323     // of the characters that we can put into the buffer.  It is possible
 324     // for each Unicode character to expand to three bytes.
 325     final int length = s.length();
 326     int lengthx3 = 3*length;
 327 
 328     if (lengthx3 >= BYTES_MAX - count)
 329     {
 330       // The requested length is greater than the unused part of the buffer
 331       flushBuffer();
 332 
 333       if (lengthx3 > BYTES_MAX)
 334       {
 335         /*
 336          * The requested length exceeds the size of the buffer,
 337          * so break it up in chunks that don't exceed the buffer size.
 338          */
 339          final int start = 0;
 340          int split = length/CHARS_MAX;
 341          final int chunks;
 342          if (length % CHARS_MAX > 0)
 343              chunks = split + 1;
 344          else
 345              chunks = split;
 346          int end_chunk = 0;
 347          for (int chunk = 1; chunk <= chunks; chunk++)
 348          {
 349              int start_chunk = end_chunk;
 350              end_chunk = start + (int) ((((long) length) * chunk) / chunks);
 351              s.getChars(start_chunk,end_chunk, m_inputChars,0);
 352              int len_chunk = (end_chunk - start_chunk);
 353 
 354              // Adjust the end of the chunk if it ends on a high char
 355              // of a Unicode surrogate pair and low char of the pair
 356              // is not going to be in the same chunk
 357              final char c = m_inputChars[len_chunk - 1];
 358              if (c >= 0xD800 && c <= 0xDBFF) {
 359                  // Exclude char in this chunk,
 360                  // to avoid spanning a Unicode character
 361                  // that is in two Java chars as a high/low surrogate
 362                  end_chunk--;
 363                  len_chunk--;
 364                  if (chunk == chunks) {
 365                      /* TODO: error message needed.
 366                       * The String incorrectly ends in a high char
 367                       * of a high/low surrogate pair, but there is
 368                       * no corresponding low as the high is the last char
 369                       * Recover by ignoring this last char.
 370                       */
 371                  }
 372              }
 373 
 374              this.write(m_inputChars,0, len_chunk);
 375          }
 376          return;
 377       }
 378     }
 379 
 380 
 381     s.getChars(0, length , m_inputChars, 0);
 382     final char[] chars = m_inputChars;
 383     final int n = length;
 384     final byte[] buf_loc = m_outputBytes; // local reference for faster access
 385     int count_loc = count;      // local integer for faster access
 386     int i = 0;
 387     {
 388         /* This block could be omitted and the code would produce
 389          * the same result. But this block exists to give the JIT
 390          * a better chance of optimizing a tight and common loop which
 391          * occurs when writing out ASCII characters.
 392          */
 393         char c;
 394         for(; i < n && (c = chars[i])< 0x80 ; i++ )
 395             buf_loc[count_loc++] = (byte)c;
 396     }
 397     for (; i < n; i++)
 398     {
 399 
 400       final char c = chars[i];
 401 
 402       if (c < 0x80)
 403         buf_loc[count_loc++] = (byte) (c);
 404       else if (c < 0x800)
 405       {
 406         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
 407         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 408       }
 409     /**
 410       * The following else if condition is added to support XML 1.1 Characters for
 411       * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
 412       * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
 413       *          [1101 11yy] [yyxx xxxx] (low surrogate)
 414       *          * uuuuu = wwww + 1
 415       */
 416     else if (c >= 0xD800 && c <= 0xDBFF)
 417     {
 418         char high, low;
 419         high = c;
 420         i++;
 421         low = chars[i];
 422 
 423         buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
 424         buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
 425         buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
 426         buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
 427     }
 428       else
 429       {
 430         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
 431         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 432         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 433       }
 434     }
 435     // Store the local integer back into the instance variable
 436     count = count_loc;
 437 
 438   }
 439 
 440   /**
 441    * Flush the internal buffer
 442    *
 443    * @throws IOException
 444    */
 445   public void flushBuffer() throws IOException
 446   {
 447 
 448     if (count > 0)
 449     {
 450       m_os.write(m_outputBytes, 0, count);
 451 
 452       count = 0;
 453     }
 454   }
 455 
 456   /**
 457    * Flush the stream.  If the stream has saved any characters from the
 458    * various write() methods in a buffer, write them immediately to their
 459    * intended destination.  Then, if that destination is another character or
 460    * byte stream, flush it.  Thus one flush() invocation will flush all the
 461    * buffers in a chain of Writers and OutputStreams.
 462    *
 463    * @exception  IOException  If an I/O error occurs
 464    *
 465    * @throws java.io.IOException
 466    */
 467   public void flush() throws java.io.IOException
 468   {
 469     flushBuffer();
 470     m_os.flush();
 471   }
 472 
 473   /**
 474    * Close the stream, flushing it first.  Once a stream has been closed,
 475    * further write() or flush() invocations will cause an IOException to be
 476    * thrown.  Closing a previously-closed stream, however, has no effect.
 477    *
 478    * @exception  IOException  If an I/O error occurs
 479    *
 480    * @throws java.io.IOException
 481    */
 482   public void close() throws java.io.IOException
 483   {
 484     flushBuffer();
 485     m_os.close();
 486   }
 487 
 488   /**
 489    * Get the output stream where the events will be serialized to.
 490    *
 491    * @return reference to the result stream, or null of only a writer was
 492    * set.
 493    */
 494   public OutputStream getOutputStream()
 495   {
 496     return m_os;
 497   }
 498 
 499   public Writer getWriter()
 500   {
 501     // Only one of getWriter() or getOutputStream() can return null
 502     // This type of writer wraps an OutputStream, not a Writer.
 503     return null;
 504   }
 505 }