1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Licensed to the Apache Software Foundation (ASF) under one 7 * or more contributor license agreements. See the NOTICE file 8 * distributed with this work for additional information 9 * regarding copyright ownership. The ASF licenses this file 10 * to you under the Apache License, Version 2.0 (the "License"); 11 * you may not use this file except in compliance with the License. 12 * You may obtain a copy of the License at 13 * 14 * http://www.apache.org/licenses/LICENSE-2.0 15 * 16 * Unless required by applicable law or agreed to in writing, software 17 * distributed under the License is distributed on an "AS IS" BASIS, 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 * See the License for the specific language governing permissions and 20 * limitations under the License. 21 */ 22 /* 23 * $Id: WriterToUTF8Buffered.java,v 1.2.4.1 2005/09/15 08:15:31 suresh_emailid Exp $ 24 */ 25 package com.sun.org.apache.xml.internal.serializer; 26 27 import java.io.IOException; 28 import java.io.OutputStream; 29 import java.io.UnsupportedEncodingException; 30 import java.io.Writer; 31 32 33 /** 34 * This class writes unicode characters to a byte stream (java.io.OutputStream) 35 * as quickly as possible. It buffers the output in an internal 36 * buffer which must be flushed to the OutputStream when done. This flushing 37 * is done via the close() flush() or flushBuffer() method. 38 * 39 * This class is only used internally within Xalan. 40 * 41 * @xsl.usage internal 42 */ 43 final class WriterToUTF8Buffered extends Writer implements WriterChain 44 { 45 46 /** number of bytes that the byte buffer can hold. 47 * This is a fixed constant is used rather than m_outputBytes.lenght for performance. 48 */ 49 private static final int BYTES_MAX=16*1024; 50 /** number of characters that the character buffer can hold. 51 * This is 1/3 of the number of bytes because UTF-8 encoding 52 * can expand one unicode character by up to 3 bytes. 53 */ 54 private static final int CHARS_MAX=(BYTES_MAX/3); 55 56 // private static final int 57 58 /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */ 59 private final OutputStream m_os; 60 61 /** 62 * The internal buffer where data is stored. 63 * (sc & sb remove final to compile in JDK 1.1.8) 64 */ 65 private final byte m_outputBytes[]; 66 67 private final char m_inputChars[]; 68 69 /** 70 * The number of valid bytes in the buffer. This value is always 71 * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements 72 * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid 73 * byte data. 74 */ 75 private int count; 76 77 /** 78 * Create an buffered UTF-8 writer. 79 * 80 * 81 * @param out the underlying output stream. 82 * 83 * @throws UnsupportedEncodingException 84 */ 85 public WriterToUTF8Buffered(OutputStream out) 86 throws UnsupportedEncodingException 87 { 88 m_os = out; 89 // get 3 extra bytes to make buffer overflow checking simpler and faster 90 // we won't have to keep checking for a few extra characters 91 m_outputBytes = new byte[BYTES_MAX + 3]; 92 93 // Big enough to hold the input chars that will be transformed 94 // into output bytes in m_ouputBytes. 95 m_inputChars = new char[CHARS_MAX + 2]; 96 count = 0; 97 98 // the old body of this constructor, before the buffersize was changed to a constant 99 // this(out, 8*1024); 100 } 101 102 /** 103 * Create an buffered UTF-8 writer to write data to the 104 * specified underlying output stream with the specified buffer 105 * size. 106 * 107 * @param out the underlying output stream. 108 * @param size the buffer size. 109 * @exception IllegalArgumentException if size <= 0. 110 */ 111 // public WriterToUTF8Buffered(final OutputStream out, final int size) 112 // { 113 // 114 // m_os = out; 115 // 116 // if (size <= 0) 117 // { 118 // throw new IllegalArgumentException( 119 // SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0"); 120 // } 121 // 122 // m_outputBytes = new byte[size]; 123 // count = 0; 124 // } 125 126 /** 127 * Write a single character. The character to be written is contained in 128 * the 16 low-order bits of the given integer value; the 16 high-order bits 129 * are ignored. 130 * 131 * <p> Subclasses that intend to support efficient single-character output 132 * should override this method. 133 * 134 * @param c int specifying a character to be written. 135 * @exception IOException If an I/O error occurs 136 */ 137 public void write(final int c) throws IOException 138 { 139 140 /* If we are close to the end of the buffer then flush it. 141 * Remember the buffer can hold a few more bytes than BYTES_MAX 142 */ 143 if (count >= BYTES_MAX) 144 flushBuffer(); 145 146 if (c < 0x80) 147 { 148 m_outputBytes[count++] = (byte) (c); 149 } 150 else if (c < 0x800) 151 { 152 m_outputBytes[count++] = (byte) (0xc0 + (c >> 6)); 153 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 154 } 155 else if (c < 0x10000) 156 { 157 m_outputBytes[count++] = (byte) (0xe0 + (c >> 12)); 158 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 159 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 160 } 161 else 162 { 163 m_outputBytes[count++] = (byte) (0xf0 + (c >> 18)); 164 m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f)); 165 m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 166 m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); 167 } 168 169 } 170 171 172 /** 173 * Write a portion of an array of characters. 174 * 175 * @param chars Array of characters 176 * @param start Offset from which to start writing characters 177 * @param length Number of characters to write 178 * 179 * @exception IOException If an I/O error occurs 180 * 181 * @throws java.io.IOException 182 */ 183 public void write(final char chars[], final int start, final int length) 184 throws java.io.IOException 185 { 186 187 // We multiply the length by three since this is the maximum length 188 // of the characters that we can put into the buffer. It is possible 189 // for each Unicode character to expand to three bytes. 190 191 int lengthx3 = 3*length; 192 193 if (lengthx3 >= BYTES_MAX - count) 194 { 195 // The requested length is greater than the unused part of the buffer 196 flushBuffer(); 197 198 if (lengthx3 > BYTES_MAX) 199 { 200 /* 201 * The requested length exceeds the size of the buffer. 202 * Cut the buffer up into chunks, each of which will 203 * not cause an overflow to the output buffer m_outputBytes, 204 * and make multiple recursive calls. 205 * Be careful about integer overflows in multiplication. 206 */ 207 int split = length/CHARS_MAX; 208 final int chunks; 209 if (length % CHARS_MAX > 0) 210 chunks = split + 1; 211 else 212 chunks = split; 213 int end_chunk = start; 214 for (int chunk = 1; chunk <= chunks; chunk++) 215 { 216 int start_chunk = end_chunk; 217 end_chunk = start + (int) ((((long) length) * chunk) / chunks); 218 219 // Adjust the end of the chunk if it ends on a high char 220 // of a Unicode surrogate pair and low char of the pair 221 // is not going to be in the same chunk 222 final char c = chars[end_chunk - 1]; 223 int ic = chars[end_chunk - 1]; 224 if (c >= 0xD800 && c <= 0xDBFF) { 225 // The last Java char that we were going 226 // to process is the first of a 227 // Java surrogate char pair that 228 // represent a Unicode character. 229 230 if (end_chunk < start + length) { 231 // Avoid spanning by including the low 232 // char in the current chunk of chars. 233 end_chunk++; 234 } else { 235 /* This is the last char of the last chunk, 236 * and it is the high char of a high/low pair with 237 * no low char provided. 238 * TODO: error message needed. 239 * The char array incorrectly ends in a high char 240 * of a high/low surrogate pair, but there is 241 * no corresponding low as the high is the last char 242 */ 243 end_chunk--; 244 } 245 } 246 247 248 int len_chunk = (end_chunk - start_chunk); 249 this.write(chars,start_chunk, len_chunk); 250 } 251 return; 252 } 253 } 254 255 256 257 final int n = length+start; 258 final byte[] buf_loc = m_outputBytes; // local reference for faster access 259 int count_loc = count; // local integer for faster access 260 int i = start; 261 { 262 /* This block could be omitted and the code would produce 263 * the same result. But this block exists to give the JIT 264 * a better chance of optimizing a tight and common loop which 265 * occurs when writing out ASCII characters. 266 */ 267 char c; 268 for(; i < n && (c = chars[i])< 0x80 ; i++ ) 269 buf_loc[count_loc++] = (byte)c; 270 } 271 for (; i < n; i++) 272 { 273 274 final char c = chars[i]; 275 276 if (c < 0x80) 277 buf_loc[count_loc++] = (byte) (c); 278 else if (c < 0x800) 279 { 280 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); 281 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 282 } 283 /** 284 * The following else if condition is added to support XML 1.1 Characters for 285 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* 286 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) 287 * [1101 11yy] [yyxx xxxx] (low surrogate) 288 * * uuuuu = wwww + 1 289 */ 290 else if (c >= 0xD800 && c <= 0xDBFF) 291 { 292 char high, low; 293 high = c; 294 i++; 295 low = chars[i]; 296 297 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); 298 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); 299 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); 300 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); 301 } 302 else 303 { 304 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); 305 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 306 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 307 } 308 } 309 // Store the local integer back into the instance variable 310 count = count_loc; 311 312 } 313 314 /** 315 * Write a string. 316 * 317 * @param s String to be written 318 * 319 * @exception IOException If an I/O error occurs 320 */ 321 public void write(final String s) throws IOException 322 { 323 324 // We multiply the length by three since this is the maximum length 325 // of the characters that we can put into the buffer. It is possible 326 // for each Unicode character to expand to three bytes. 327 final int length = s.length(); 328 int lengthx3 = 3*length; 329 330 if (lengthx3 >= BYTES_MAX - count) 331 { 332 // The requested length is greater than the unused part of the buffer 333 flushBuffer(); 334 335 if (lengthx3 > BYTES_MAX) 336 { 337 /* 338 * The requested length exceeds the size of the buffer, 339 * so break it up in chunks that don't exceed the buffer size. 340 */ 341 final int start = 0; 342 int split = length/CHARS_MAX; 343 final int chunks; 344 if (length % CHARS_MAX > 0) 345 chunks = split + 1; 346 else 347 chunks = split; 348 int end_chunk = 0; 349 for (int chunk = 1; chunk <= chunks; chunk++) 350 { 351 int start_chunk = end_chunk; 352 end_chunk = start + (int) ((((long) length) * chunk) / chunks); 353 s.getChars(start_chunk,end_chunk, m_inputChars,0); 354 int len_chunk = (end_chunk - start_chunk); 355 356 // Adjust the end of the chunk if it ends on a high char 357 // of a Unicode surrogate pair and low char of the pair 358 // is not going to be in the same chunk 359 final char c = m_inputChars[len_chunk - 1]; 360 if (c >= 0xD800 && c <= 0xDBFF) { 361 // Exclude char in this chunk, 362 // to avoid spanning a Unicode character 363 // that is in two Java chars as a high/low surrogate 364 end_chunk--; 365 len_chunk--; 366 if (chunk == chunks) { 367 /* TODO: error message needed. 368 * The String incorrectly ends in a high char 369 * of a high/low surrogate pair, but there is 370 * no corresponding low as the high is the last char 371 * Recover by ignoring this last char. 372 */ 373 } 374 } 375 376 this.write(m_inputChars,0, len_chunk); 377 } 378 return; 379 } 380 } 381 382 383 s.getChars(0, length , m_inputChars, 0); 384 final char[] chars = m_inputChars; 385 final int n = length; 386 final byte[] buf_loc = m_outputBytes; // local reference for faster access 387 int count_loc = count; // local integer for faster access 388 int i = 0; 389 { 390 /* This block could be omitted and the code would produce 391 * the same result. But this block exists to give the JIT 392 * a better chance of optimizing a tight and common loop which 393 * occurs when writing out ASCII characters. 394 */ 395 char c; 396 for(; i < n && (c = chars[i])< 0x80 ; i++ ) 397 buf_loc[count_loc++] = (byte)c; 398 } 399 for (; i < n; i++) 400 { 401 402 final char c = chars[i]; 403 404 if (c < 0x80) 405 buf_loc[count_loc++] = (byte) (c); 406 else if (c < 0x800) 407 { 408 buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); 409 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 410 } 411 /** 412 * The following else if condition is added to support XML 1.1 Characters for 413 * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* 414 * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) 415 * [1101 11yy] [yyxx xxxx] (low surrogate) 416 * * uuuuu = wwww + 1 417 */ 418 else if (c >= 0xD800 && c <= 0xDBFF) 419 { 420 char high, low; 421 high = c; 422 i++; 423 low = chars[i]; 424 425 buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); 426 buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); 427 buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); 428 buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); 429 } 430 else 431 { 432 buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); 433 buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); 434 buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); 435 } 436 } 437 // Store the local integer back into the instance variable 438 count = count_loc; 439 440 } 441 442 /** 443 * Flush the internal buffer 444 * 445 * @throws IOException 446 */ 447 public void flushBuffer() throws IOException 448 { 449 450 if (count > 0) 451 { 452 m_os.write(m_outputBytes, 0, count); 453 454 count = 0; 455 } 456 } 457 458 /** 459 * Flush the stream. If the stream has saved any characters from the 460 * various write() methods in a buffer, write them immediately to their 461 * intended destination. Then, if that destination is another character or 462 * byte stream, flush it. Thus one flush() invocation will flush all the 463 * buffers in a chain of Writers and OutputStreams. 464 * 465 * @exception IOException If an I/O error occurs 466 * 467 * @throws java.io.IOException 468 */ 469 public void flush() throws java.io.IOException 470 { 471 flushBuffer(); 472 m_os.flush(); 473 } 474 475 /** 476 * Close the stream, flushing it first. Once a stream has been closed, 477 * further write() or flush() invocations will cause an IOException to be 478 * thrown. Closing a previously-closed stream, however, has no effect. 479 * 480 * @exception IOException If an I/O error occurs 481 * 482 * @throws java.io.IOException 483 */ 484 public void close() throws java.io.IOException 485 { 486 flushBuffer(); 487 m_os.close(); 488 } 489 490 /** 491 * Get the output stream where the events will be serialized to. 492 * 493 * @return reference to the result stream, or null of only a writer was 494 * set. 495 */ 496 public OutputStream getOutputStream() 497 { 498 return m_os; 499 } 500 501 public Writer getWriter() 502 { 503 // Only one of getWriter() or getOutputStream() can return null 504 // This type of writer wraps an OutputStream, not a Writer. 505 return null; 506 } 507 }