1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Copyright 1999-2005 The Apache Software Foundation. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 /* 21 * $Id: WriterToUTF8Buffered.java,v 1.2.4.1 2005/09/15 08:15:31 suresh_emailid Exp $ 22 */ 23 package com.sun.org.apache.xml.internal.serializer; 24 25 import java.io.IOException; 26 import java.io.OutputStream; 27 import java.io.UnsupportedEncodingException; 28 import java.io.Writer; 29 30 31 /** 32 * This class writes unicode characters to a byte stream (java.io.OutputStream) 33 * as quickly as possible. It buffers the output in an internal 34 * buffer which must be flushed to the OutputStream when done. This flushing 35 * is done via the close() flush() or flushBuffer() method. 36 * 37 * This class is only used internally within Xalan. 38 * 39 * @xsl.usage internal 40 */ 41 final class WriterToUTF8Buffered extends Writer implements WriterChain 42 { 43 44 /** number of bytes that the byte buffer can hold. 45 * This is a fixed constant is used rather than m_outputBytes.lenght for performance. 46 */ 47 private static final int BYTES_MAX=16*1024; 48 /** number of characters that the character buffer can hold. 49 * This is 1/3 of the number of bytes because UTF-8 encoding 50 * can expand one unicode character by up to 3 bytes. 51 */ 52 private static final int CHARS_MAX=(BYTES_MAX/3); 53 54 // private static final int 55 56 /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */ 57 private final OutputStream m_os; 58 59 /** 60 * The internal buffer where data is stored. 61 * (sc & sb remove final to compile in JDK 1.1.8) 62 */ 63 private final byte m_outputBytes[]; 64 65 private final char m_inputChars[]; 66 67 /** 68 * The number of valid bytes in the buffer. This value is always 69 * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements 70 * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid 71 * byte data. 72 */ 73 private int count; 74 75 /** 76 * Create an buffered UTF-8 writer. 77 * 78 * 79 * @param out the underlying output stream. 80 * 81 * @throws UnsupportedEncodingException 82 */ 83 public WriterToUTF8Buffered(OutputStream out) 84 throws UnsupportedEncodingException 85 { 86 m_os = out; 87 // get 3 extra bytes to make buffer overflow checking simpler and faster 88 // we won't have to keep checking for a few extra characters 89 m_outputBytes = new byte[BYTES_MAX + 3]; 90 91 // Big enough to hold the input chars that will be transformed 92 // into output bytes in m_ouputBytes. 93 m_inputChars = new char[CHARS_MAX + 2]; 94 count = 0; 95 96 // the old body of this constructor, before the buffersize was changed to a constant 97 // this(out, 8*1024); 98 } 99 100 /** 101 * Create an buffered UTF-8 writer to write data to the 102 * specified underlying output stream with the specified buffer 103 * size. 104 * 105 * @param out the underlying output stream. 106 * @param size the buffer size. 107 * @exception IllegalArgumentException if size <= 0. 108 */ 109 // public WriterToUTF8Buffered(final OutputStream out, final int size) 110 // { 111 // 112 // m_os = out; 113 // 114 // if (size <= 0) 115 // { 116 // throw new IllegalArgumentException( 117 // SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0"); 118 // } 119 // 120 // m_outputBytes = new byte[size]; 121 // count = 0; 122 // } 123 124 /** 125 * Write a single character. The character to be written is contained in 126 * the 16 low-order bits of the given integer value; the 16 high-order bits 127 * are ignored. 128 * 129 * <p> Subclasses that intend to support efficient single-character output 130 * should override this method. 131 * 132 * @param c int specifying a character to be written. 133 * @exception IOException If an I/O error occurs 134 */ 135 public void write(final int c) throws IOException 136 { 137 138 /* If we are close to the end of the buffer then flush it. 139 * Remember the buffer can hold a few more bytes than BYTES_MAX 140 */ 141 if (count >= BYTES_MAX) 142 flushBuffer(); 143 144 if (c < 0x80) 145 { 146 m_outputBytes[count++] = (byte) (c); 147 } 148 else if (c < 0x800) 149 { 150 m_outputBytes[count++] = (byte) (0xc0 + (c >> 6)); 151 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 152 } 153 else if (c < 0x10000) 154 { 155 m_outputBytes[count++] = (byte) (0xe0 + (c >> 12)); 156 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 157 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 158 } 159 else 160 { 161 m_outputBytes[count++] = (byte) (0xf0 + (c >> 18)); 162 m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f)); 163 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 164 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 165 } 166 167 } 168 169 170 /** 171 * Write a portion of an array of characters. 172 * 173 * @param chars Array of characters 174 * @param start Offset from which to start writing characters 175 * @param length Number of characters to write 176 * 177 * @exception IOException If an I/O error occurs 178 * 179 * @throws java.io.IOException 180 */ 181 public void write(final char chars[], final int start, final int length) 182 throws java.io.IOException 183 { 184 185 // We multiply the length by three since this is the maximum length 186 // of the characters that we can put into the buffer. It is possible 187 // for each Unicode character to expand to three bytes. 188 189 int lengthx3 = 3*length; 190 191 if (lengthx3 >= BYTES_MAX - count) 192 { 193 // The requested length is greater than the unused part of the buffer 194 flushBuffer(); 195 196 if (lengthx3 > BYTES_MAX) 197 { 198 /* 199 * The requested length exceeds the size of the buffer. 200 * Cut the buffer up into chunks, each of which will 201 * not cause an overflow to the output buffer m_outputBytes, 202 * and make multiple recursive calls. 203 * Be careful about integer overflows in multiplication. 204 */ 205 int split = length/CHARS_MAX; 206 final int chunks; 207 if (length % CHARS_MAX > 0) 208 chunks = split + 1; 209 else 210 chunks = split; 211 int end_chunk = start; 212 for (int chunk = 1; chunk <= chunks; chunk++) 213 { 214 int start_chunk = end_chunk; 215 end_chunk = start + (int) ((((long) length) * chunk) / chunks); 216 217 // Adjust the end of the chunk if it ends on a high char 218 // of a Unicode surrogate pair and low char of the pair 219 // is not going to be in the same chunk 220 final char c = chars[end_chunk - 1]; 221 int ic = chars[end_chunk - 1]; 222 if (c >= 0xD800 && c <= 0xDBFF) { 223 // The last Java char that we were going 224 // to process is the first of a 225 // Java surrogate char pair that 226 // represent a Unicode character. 227 228 if (end_chunk < start + length) { 229 // Avoid spanning by including the low 230 // char in the current chunk of chars. 231 end_chunk++; 232 } else { 233 /* This is the last char of the last chunk, 234 * and it is the high char of a high/low pair with 235 * no low char provided. 236 * TODO: error message needed. 237 * The char array incorrectly ends in a high char 238 * of a high/low surrogate pair, but there is 239 * no corresponding low as the high is the last char 240 */ 241 end_chunk--; 242 } 243 } 244 245 246 int len_chunk = (end_chunk - start_chunk); 247 this.write(chars,start_chunk, len_chunk); 248 } 249 return; 250 } 251 } 252 253 254 255 final int n = length+start; 256 final byte[] buf_loc = m_outputBytes; // local reference for faster access 257 int count_loc = count; // local integer for faster access 258 int i = start; 259 { 260 /* This block could be omitted and the code would produce 261 * the same result. But this block exists to give the JIT 262 * a better chance of optimizing a tight and common loop which 263 * occurs when writing out ASCII characters. 264 */ 265 char c; 266 for(; i < n && (c = chars[i])< 0x80 ; i++ ) 267 buf_loc[count_loc++] = (byte)c; 268 } 269 for (; i < n; i++) 270 { 271 272 final char c = chars[i]; 273 274 if (c < 0x80) 275 buf_loc[count_loc++] = (byte) (c); 276 else if (c < 0x800) 277 { 278 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); 279 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 280 } 281 /** 282 * The following else if condition is added to support XML 1.1 Characters for 283 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* 284 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) 285 * [1101 11yy] [yyxx xxxx] (low surrogate) 286 * * uuuuu = wwww + 1 287 */ 288 else if (c >= 0xD800 && c <= 0xDBFF) 289 { 290 char high, low; 291 high = c; 292 i++; 293 low = chars[i]; 294 295 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); 296 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); 297 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); 298 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); 299 } 300 else 301 { 302 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); 303 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 304 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 305 } 306 } 307 // Store the local integer back into the instance variable 308 count = count_loc; 309 310 } 311 312 /** 313 * Write a string. 314 * 315 * @param s String to be written 316 * 317 * @exception IOException If an I/O error occurs 318 */ 319 public void write(final String s) throws IOException 320 { 321 322 // We multiply the length by three since this is the maximum length 323 // of the characters that we can put into the buffer. It is possible 324 // for each Unicode character to expand to three bytes. 325 final int length = s.length(); 326 int lengthx3 = 3*length; 327 328 if (lengthx3 >= BYTES_MAX - count) 329 { 330 // The requested length is greater than the unused part of the buffer 331 flushBuffer(); 332 333 if (lengthx3 > BYTES_MAX) 334 { 335 /* 336 * The requested length exceeds the size of the buffer, 337 * so break it up in chunks that don't exceed the buffer size. 338 */ 339 final int start = 0; 340 int split = length/CHARS_MAX; 341 final int chunks; 342 if (length % CHARS_MAX > 0) 343 chunks = split + 1; 344 else 345 chunks = split; 346 int end_chunk = 0; 347 for (int chunk = 1; chunk <= chunks; chunk++) 348 { 349 int start_chunk = end_chunk; 350 end_chunk = start + (int) ((((long) length) * chunk) / chunks); 351 s.getChars(start_chunk,end_chunk, m_inputChars,0); 352 int len_chunk = (end_chunk - start_chunk); 353 354 // Adjust the end of the chunk if it ends on a high char 355 // of a Unicode surrogate pair and low char of the pair 356 // is not going to be in the same chunk 357 final char c = m_inputChars[len_chunk - 1]; 358 if (c >= 0xD800 && c <= 0xDBFF) { 359 // Exclude char in this chunk, 360 // to avoid spanning a Unicode character 361 // that is in two Java chars as a high/low surrogate 362 end_chunk--; 363 len_chunk--; 364 if (chunk == chunks) { 365 /* TODO: error message needed. 366 * The String incorrectly ends in a high char 367 * of a high/low surrogate pair, but there is 368 * no corresponding low as the high is the last char 369 * Recover by ignoring this last char. 370 */ 371 } 372 } 373 374 this.write(m_inputChars,0, len_chunk); 375 } 376 return; 377 } 378 } 379 380 381 s.getChars(0, length , m_inputChars, 0); 382 final char[] chars = m_inputChars; 383 final int n = length; 384 final byte[] buf_loc = m_outputBytes; // local reference for faster access 385 int count_loc = count; // local integer for faster access 386 int i = 0; 387 { 388 /* This block could be omitted and the code would produce 389 * the same result. But this block exists to give the JIT 390 * a better chance of optimizing a tight and common loop which 391 * occurs when writing out ASCII characters. 392 */ 393 char c; 394 for(; i < n && (c = chars[i])< 0x80 ; i++ ) 395 buf_loc[count_loc++] = (byte)c; 396 } 397 for (; i < n; i++) 398 { 399 400 final char c = chars[i]; 401 402 if (c < 0x80) 403 buf_loc[count_loc++] = (byte) (c); 404 else if (c < 0x800) 405 { 406 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); 407 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 408 } 409 /** 410 * The following else if condition is added to support XML 1.1 Characters for 411 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* 412 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) 413 * [1101 11yy] [yyxx xxxx] (low surrogate) 414 * * uuuuu = wwww + 1 415 */ 416 else if (c >= 0xD800 && c <= 0xDBFF) 417 { 418 char high, low; 419 high = c; 420 i++; 421 low = chars[i]; 422 423 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); 424 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); 425 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); 426 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); 427 } 428 else 429 { 430 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); 431 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 432 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 433 } 434 } 435 // Store the local integer back into the instance variable 436 count = count_loc; 437 438 } 439 440 /** 441 * Flush the internal buffer 442 * 443 * @throws IOException 444 */ 445 public void flushBuffer() throws IOException 446 { 447 448 if (count > 0) 449 { 450 m_os.write(m_outputBytes, 0, count); 451 452 count = 0; 453 } 454 } 455 456 /** 457 * Flush the stream. If the stream has saved any characters from the 458 * various write() methods in a buffer, write them immediately to their 459 * intended destination. Then, if that destination is another character or 460 * byte stream, flush it. Thus one flush() invocation will flush all the 461 * buffers in a chain of Writers and OutputStreams. 462 * 463 * @exception IOException If an I/O error occurs 464 * 465 * @throws java.io.IOException 466 */ 467 public void flush() throws java.io.IOException 468 { 469 flushBuffer(); 470 m_os.flush(); 471 } 472 473 /** 474 * Close the stream, flushing it first. Once a stream has been closed, 475 * further write() or flush() invocations will cause an IOException to be 476 * thrown. Closing a previously-closed stream, however, has no effect. 477 * 478 * @exception IOException If an I/O error occurs 479 * 480 * @throws java.io.IOException 481 */ 482 public void close() throws java.io.IOException 483 { 484 flushBuffer(); 485 m_os.close(); 486 } 487 488 /** 489 * Get the output stream where the events will be serialized to. 490 * 491 * @return reference to the result stream, or null of only a writer was 492 * set. 493 */ 494 public OutputStream getOutputStream() 495 { 496 return m_os; 497 } 498 499 public Writer getWriter() 500 { 501 // Only one of getWriter() or getOutputStream() can return null 502 // This type of writer wraps an OutputStream, not a Writer. 503 return null; 504 } 505 }