1 /*
   2  * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.internal.util.xml.impl;
  27 
  28 import java.io.Reader;
  29 import java.io.InputStream;
  30 import java.io.IOException;
  31 import java.io.UnsupportedEncodingException;
  32 
  33 /**
  34  * UTF-8 transformed UCS-2 character stream reader.
  35  *
  36  * This reader converts UTF-8 transformed UCS-2 characters to Java characters.
  37  * The UCS-2 subset of UTF-8 transformation is described in RFC-2279 #2 
  38  * "UTF-8 definition":
  39  *  0000 0000-0000 007F   0xxxxxxx
  40  *  0000 0080-0000 07FF   110xxxxx 10xxxxxx
  41  *  0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
  42  *
  43  * This reader will return incorrect last character on broken UTF-8 stream. 
  44  */
  45 public class ReaderUTF8
  46         extends Reader
  47 {
  48         private InputStream is;
  49 
  50         /**
  51          * Constructor.
  52          *
  53          * @param is A byte input stream.
  54          */
  55         public ReaderUTF8(InputStream is)
  56         {
  57                 this.is = is;
  58         }
  59 
  60         /**
  61          * Reads characters into a portion of an array.
  62          *
  63          * @param cbuf Destination buffer.
  64          * @param off Offset at which to start storing characters.
  65          * @param len Maximum number of characters to read.
  66          * @exception IOException If any IO errors occur.
  67          * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
  68          */
  69         public int read(char[] cbuf, int off, int len)
  70                 throws IOException
  71         {
  72                 int  num = 0;
  73                 int  val;
  74                 while (num < len) {
  75                         if ((val = is.read()) < 0)
  76                                 return (num != 0)? num: -1;
  77                         switch (val & 0xf0) {
  78                         case 0xc0:
  79                         case 0xd0:
  80                                 cbuf[off++] = (char)(((val & 0x1f) << 6) | (is.read() & 0x3f));
  81                                 break;
  82 
  83                         case 0xe0:
  84                                 cbuf[off++] = (char)(((val & 0x0f) << 12) | 
  85                                         ((is.read() & 0x3f) << 6) | (is.read() & 0x3f));
  86                                 break;
  87 
  88                         case 0xf0:      // UCS-4 character
  89                                 throw new UnsupportedEncodingException("UTF-32 (or UCS-4) encoding not supported.");
  90 
  91                         default:
  92                                 cbuf[off++] = (char)val;
  93                                 break;
  94                         }
  95                         num++;
  96                 }
  97                 return num;
  98         }
  99 
 100         /**
 101          * Reads a single character.
 102          *
 103          * @return The character read, as an integer in the range 0 to 65535 
 104          *      (0x00-0xffff), or -1 if the end of the stream has been reached.
 105          * @exception IOException If any IO errors occur.
 106          * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
 107          */
 108         public int read()
 109                 throws IOException
 110         {
 111                 int  val;
 112                 if ((val = is.read()) < 0)
 113                         return -1;
 114                 switch (val & 0xf0) {
 115                 case 0xc0:
 116                 case 0xd0:
 117                         val = ((val & 0x1f) << 6) | (is.read() & 0x3f);
 118                         break;
 119 
 120                 case 0xe0:
 121                         val = ((val & 0x0f) << 12) | 
 122                                 ((is.read() & 0x3f) << 6) | (is.read() & 0x3f);
 123                         break;
 124 
 125                 case 0xf0:      // UCS-4 character
 126                         throw new UnsupportedEncodingException();
 127 
 128                 default:
 129                         break;
 130                 }
 131                 return val;
 132         }
 133 
 134         /**
 135          * Closes the stream.
 136          *
 137          * @exception IOException If any IO errors occur.
 138          */
 139         public void close()
 140                 throws IOException
 141         {
 142                 is.close();
 143         }
 144 }