New src/com/sun/org/apache/xml/internal/serializer/WriterToUTF8Buffered.java

   1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Licensed to the Apache Software Foundation (ASF) under one
   7  * or more contributor license agreements. See the NOTICE file
   8  * distributed with this work for additional information
   9  * regarding copyright ownership. The ASF licenses this file
  10  * to you under the Apache License, Version 2.0 (the  "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * Unless required by applicable law or agreed to in writing, software
  17  * distributed under the License is distributed on an "AS IS" BASIS,
  18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19  * See the License for the specific language governing permissions and
  20  * limitations under the License.
  21  */
  22 /*
  23  * $Id: WriterToUTF8Buffered.java,v 1.2.4.1 2005/09/15 08:15:31 suresh_emailid Exp $
  24  */
  25 package com.sun.org.apache.xml.internal.serializer;
  26 
  27 import java.io.IOException;
  28 import java.io.OutputStream;
  29 import java.io.UnsupportedEncodingException;
  30 import java.io.Writer;
  31 
  32 
  33 /**
  34  * This class writes unicode characters to a byte stream (java.io.OutputStream)
  35  * as quickly as possible. It buffers the output in an internal
  36  * buffer which must be flushed to the OutputStream when done. This flushing
  37  * is done via the close() flush() or flushBuffer() method.
  38  *
  39  * This class is only used internally within Xalan.
  40  *
  41  * @xsl.usage internal
  42  */
  43 final class WriterToUTF8Buffered extends Writer implements WriterChain
  44 {
  45 
  46   /** number of bytes that the byte buffer can hold.
  47    * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
  48    */
  49   private static final int BYTES_MAX=16*1024;
  50   /** number of characters that the character buffer can hold.
  51    * This is 1/3 of the number of bytes because UTF-8 encoding
  52    * can expand one unicode character by up to 3 bytes.
  53    */
  54   private static final int CHARS_MAX=(BYTES_MAX/3);
  55 
  56  // private static final int
  57 
  58   /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
  59   private final OutputStream m_os;
  60 
  61   /**
  62    * The internal buffer where data is stored.
  63    * (sc & sb remove final to compile in JDK 1.1.8)
  64    */
  65   private final byte m_outputBytes[];
  66 
  67   private final char m_inputChars[];
  68 
  69   /**
  70    * The number of valid bytes in the buffer. This value is always
  71    * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
  72    * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
  73    * byte data.
  74    */
  75   private int count;
  76 
  77   /**
  78    * Create an buffered UTF-8 writer.
  79    *
  80    *
  81    * @param   out    the underlying output stream.
  82    *
  83    * @throws UnsupportedEncodingException
  84    */
  85   public WriterToUTF8Buffered(OutputStream out)
  86           throws UnsupportedEncodingException
  87   {
  88       m_os = out;
  89       // get 3 extra bytes to make buffer overflow checking simpler and faster
  90       // we won't have to keep checking for a few extra characters
  91       m_outputBytes = new byte[BYTES_MAX + 3];
  92 
  93       // Big enough to hold the input chars that will be transformed
  94       // into output bytes in m_ouputBytes.
  95       m_inputChars = new char[CHARS_MAX + 2];
  96       count = 0;
  97 
  98 //      the old body of this constructor, before the buffersize was changed to a constant
  99 //      this(out, 8*1024);
 100   }
 101 
 102   /**
 103    * Create an buffered UTF-8 writer to write data to the
 104    * specified underlying output stream with the specified buffer
 105    * size.
 106    *
 107    * @param   out    the underlying output stream.
 108    * @param   size   the buffer size.
 109    * @exception IllegalArgumentException if size <= 0.
 110    */
 111 //  public WriterToUTF8Buffered(final OutputStream out, final int size)
 112 //  {
 113 //
 114 //    m_os = out;
 115 //
 116 //    if (size <= 0)
 117 //    {
 118 //      throw new IllegalArgumentException(
 119 //        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
 120 //    }
 121 //
 122 //    m_outputBytes = new byte[size];
 123 //    count = 0;
 124 //  }
 125 
 126   /**
 127    * Write a single character.  The character to be written is contained in
 128    * the 16 low-order bits of the given integer value; the 16 high-order bits
 129    * are ignored.
 130    *
 131    * <p> Subclasses that intend to support efficient single-character output
 132    * should override this method.
 133    *
 134    * @param c  int specifying a character to be written.
 135    * @exception  IOException  If an I/O error occurs
 136    */
 137   public void write(final int c) throws IOException
 138   {
 139 
 140     /* If we are close to the end of the buffer then flush it.
 141      * Remember the buffer can hold a few more bytes than BYTES_MAX
 142      */
 143     if (count >= BYTES_MAX)
 144         flushBuffer();
 145 
 146     if (c < 0x80)
 147     {
 148        m_outputBytes[count++] = (byte) (c);
 149     }
 150     else if (c < 0x800)
 151     {
 152       m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
 153       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 154     }
 155     else if (c < 0x10000)
 156     {
 157       m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
 158       m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 159       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 160     }
 161         else
 162         {
 163           m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
 164           m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
 165           m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 166           m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
 167         }
 168 
 169   }
 170 
 171 
 172   /**
 173    * Write a portion of an array of characters.
 174    *
 175    * @param  chars  Array of characters
 176    * @param  start   Offset from which to start writing characters
 177    * @param  length   Number of characters to write
 178    *
 179    * @exception  IOException  If an I/O error occurs
 180    *
 181    * @throws java.io.IOException
 182    */
 183   public void write(final char chars[], final int start, final int length)
 184           throws java.io.IOException
 185   {
 186 
 187     // We multiply the length by three since this is the maximum length
 188     // of the characters that we can put into the buffer.  It is possible
 189     // for each Unicode character to expand to three bytes.
 190 
 191     int lengthx3 = 3*length;
 192 
 193     if (lengthx3 >= BYTES_MAX - count)
 194     {
 195       // The requested length is greater than the unused part of the buffer
 196       flushBuffer();
 197 
 198       if (lengthx3 > BYTES_MAX)
 199       {
 200         /*
 201          * The requested length exceeds the size of the buffer.
 202          * Cut the buffer up into chunks, each of which will
 203          * not cause an overflow to the output buffer m_outputBytes,
 204          * and make multiple recursive calls.
 205          * Be careful about integer overflows in multiplication.
 206          */
 207         int split = length/CHARS_MAX;
 208         final int chunks;
 209         if (length % CHARS_MAX > 0)
 210             chunks = split + 1;
 211         else
 212             chunks = split;
 213         int end_chunk = start;
 214         for (int chunk = 1; chunk <= chunks; chunk++)
 215         {
 216             int start_chunk = end_chunk;
 217             end_chunk = start + (int) ((((long) length) * chunk) / chunks);
 218 
 219             // Adjust the end of the chunk if it ends on a high char
 220             // of a Unicode surrogate pair and low char of the pair
 221             // is not going to be in the same chunk
 222             final char c = chars[end_chunk - 1];
 223             int ic = chars[end_chunk - 1];
 224             if (c >= 0xD800 && c <= 0xDBFF) {
 225                 // The last Java char that we were going
 226                 // to process is the first of a
 227                 // Java surrogate char pair that
 228                 // represent a Unicode character.
 229 
 230                 if (end_chunk < start + length) {
 231                     // Avoid spanning by including the low
 232                     // char in the current chunk of chars.
 233                     end_chunk++;
 234                 } else {
 235                     /* This is the last char of the last chunk,
 236                      * and it is the high char of a high/low pair with
 237                      * no low char provided.
 238                      * TODO: error message needed.
 239                      * The char array incorrectly ends in a high char
 240                      * of a high/low surrogate pair, but there is
 241                      * no corresponding low as the high is the last char
 242                      */
 243                     end_chunk--;
 244                 }
 245             }
 246 
 247 
 248             int len_chunk = (end_chunk - start_chunk);
 249             this.write(chars,start_chunk, len_chunk);
 250         }
 251         return;
 252       }
 253     }
 254 
 255 
 256 
 257     final int n = length+start;
 258     final byte[] buf_loc = m_outputBytes; // local reference for faster access
 259     int count_loc = count;      // local integer for faster access
 260     int i = start;
 261     {
 262         /* This block could be omitted and the code would produce
 263          * the same result. But this block exists to give the JIT
 264          * a better chance of optimizing a tight and common loop which
 265          * occurs when writing out ASCII characters.
 266          */
 267         char c;
 268         for(; i < n && (c = chars[i])< 0x80 ; i++ )
 269             buf_loc[count_loc++] = (byte)c;
 270     }
 271     for (; i < n; i++)
 272     {
 273 
 274       final char c = chars[i];
 275 
 276       if (c < 0x80)
 277         buf_loc[count_loc++] = (byte) (c);
 278       else if (c < 0x800)
 279       {
 280         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
 281         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 282       }
 283       /**
 284         * The following else if condition is added to support XML 1.1 Characters for
 285         * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
 286         * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
 287         *          [1101 11yy] [yyxx xxxx] (low surrogate)
 288         *          * uuuuu = wwww + 1
 289         */
 290       else if (c >= 0xD800 && c <= 0xDBFF)
 291       {
 292           char high, low;
 293           high = c;
 294           i++;
 295           low = chars[i];
 296 
 297           buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
 298           buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
 299           buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
 300           buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
 301       }
 302       else
 303       {
 304         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
 305         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 306         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 307       }
 308     }
 309     // Store the local integer back into the instance variable
 310     count = count_loc;
 311 
 312   }
 313 
 314   /**
 315    * Write a string.
 316    *
 317    * @param  s  String to be written
 318    *
 319    * @exception  IOException  If an I/O error occurs
 320    */
 321   public void write(final String s) throws IOException
 322   {
 323 
 324     // We multiply the length by three since this is the maximum length
 325     // of the characters that we can put into the buffer.  It is possible
 326     // for each Unicode character to expand to three bytes.
 327     final int length = s.length();
 328     int lengthx3 = 3*length;
 329 
 330     if (lengthx3 >= BYTES_MAX - count)
 331     {
 332       // The requested length is greater than the unused part of the buffer
 333       flushBuffer();
 334 
 335       if (lengthx3 > BYTES_MAX)
 336       {
 337         /*
 338          * The requested length exceeds the size of the buffer,
 339          * so break it up in chunks that don't exceed the buffer size.
 340          */
 341          final int start = 0;
 342          int split = length/CHARS_MAX;
 343          final int chunks;
 344          if (length % CHARS_MAX > 0)
 345              chunks = split + 1;
 346          else
 347              chunks = split;
 348          int end_chunk = 0;
 349          for (int chunk = 1; chunk <= chunks; chunk++)
 350          {
 351              int start_chunk = end_chunk;
 352              end_chunk = start + (int) ((((long) length) * chunk) / chunks);
 353              s.getChars(start_chunk,end_chunk, m_inputChars,0);
 354              int len_chunk = (end_chunk - start_chunk);
 355 
 356              // Adjust the end of the chunk if it ends on a high char
 357              // of a Unicode surrogate pair and low char of the pair
 358              // is not going to be in the same chunk
 359              final char c = m_inputChars[len_chunk - 1];
 360              if (c >= 0xD800 && c <= 0xDBFF) {
 361                  // Exclude char in this chunk,
 362                  // to avoid spanning a Unicode character
 363                  // that is in two Java chars as a high/low surrogate
 364                  end_chunk--;
 365                  len_chunk--;
 366                  if (chunk == chunks) {
 367                      /* TODO: error message needed.
 368                       * The String incorrectly ends in a high char
 369                       * of a high/low surrogate pair, but there is
 370                       * no corresponding low as the high is the last char
 371                       * Recover by ignoring this last char.
 372                       */
 373                  }
 374              }
 375 
 376              this.write(m_inputChars,0, len_chunk);
 377          }
 378          return;
 379       }
 380     }
 381 
 382 
 383     s.getChars(0, length , m_inputChars, 0);
 384     final char[] chars = m_inputChars;
 385     final int n = length;
 386     final byte[] buf_loc = m_outputBytes; // local reference for faster access
 387     int count_loc = count;      // local integer for faster access
 388     int i = 0;
 389     {
 390         /* This block could be omitted and the code would produce
 391          * the same result. But this block exists to give the JIT
 392          * a better chance of optimizing a tight and common loop which
 393          * occurs when writing out ASCII characters.
 394          */
 395         char c;
 396         for(; i < n && (c = chars[i])< 0x80 ; i++ )
 397             buf_loc[count_loc++] = (byte)c;
 398     }
 399     for (; i < n; i++)
 400     {
 401 
 402       final char c = chars[i];
 403 
 404       if (c < 0x80)
 405         buf_loc[count_loc++] = (byte) (c);
 406       else if (c < 0x800)
 407       {
 408         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
 409         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 410       }
 411     /**
 412       * The following else if condition is added to support XML 1.1 Characters for
 413       * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
 414       * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
 415       *          [1101 11yy] [yyxx xxxx] (low surrogate)
 416       *          * uuuuu = wwww + 1
 417       */
 418     else if (c >= 0xD800 && c <= 0xDBFF)
 419     {
 420         char high, low;
 421         high = c;
 422         i++;
 423         low = chars[i];
 424 
 425         buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
 426         buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
 427         buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
 428         buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
 429     }
 430       else
 431       {
 432         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
 433         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
 434         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
 435       }
 436     }
 437     // Store the local integer back into the instance variable
 438     count = count_loc;
 439 
 440   }
 441 
 442   /**
 443    * Flush the internal buffer
 444    *
 445    * @throws IOException
 446    */
 447   public void flushBuffer() throws IOException
 448   {
 449 
 450     if (count > 0)
 451     {
 452       m_os.write(m_outputBytes, 0, count);
 453 
 454       count = 0;
 455     }
 456   }
 457 
 458   /**
 459    * Flush the stream.  If the stream has saved any characters from the
 460    * various write() methods in a buffer, write them immediately to their
 461    * intended destination.  Then, if that destination is another character or
 462    * byte stream, flush it.  Thus one flush() invocation will flush all the
 463    * buffers in a chain of Writers and OutputStreams.
 464    *
 465    * @exception  IOException  If an I/O error occurs
 466    *
 467    * @throws java.io.IOException
 468    */
 469   public void flush() throws java.io.IOException
 470   {
 471     flushBuffer();
 472     m_os.flush();
 473   }
 474 
 475   /**
 476    * Close the stream, flushing it first.  Once a stream has been closed,
 477    * further write() or flush() invocations will cause an IOException to be
 478    * thrown.  Closing a previously-closed stream, however, has no effect.
 479    *
 480    * @exception  IOException  If an I/O error occurs
 481    *
 482    * @throws java.io.IOException
 483    */
 484   public void close() throws java.io.IOException
 485   {
 486     flushBuffer();
 487     m_os.close();
 488   }
 489 
 490   /**
 491    * Get the output stream where the events will be serialized to.
 492    *
 493    * @return reference to the result stream, or null of only a writer was
 494    * set.
 495    */
 496   public OutputStream getOutputStream()
 497   {
 498     return m_os;
 499   }
 500 
 501   public Writer getWriter()
 502   {
 503     // Only one of getWriter() or getOutputStream() can return null
 504     // This type of writer wraps an OutputStream, not a Writer.
 505     return null;
 506   }
 507 }