src/com/sun/org/apache/xerces/internal/impl/XMLEntityManager.java

Print this page




2592      * Passed a URI that contains invalid characters (like spaces, non-ASCII Unicode characters, and the like),
2593      * this function percent encodes the invalid characters per the URI specification (i.e., as a sequence of
2594      * %-encoded UTF-8 octets).
2595      *
2596      * N.B. There are two problems. If the URI contains a '%' character, that might be an indication that
2597      * the URI has already been escaped by the author, or it might be an invalid '%'. In the former case,
2598      * it's important not to escape it, or we'll wind up with invalid, doubly-escaped '%'s. In the latter,
2599      * the URI is broken if we don't encode it. Similarly, a '#' character might be the start of a fragment
2600      * identifier or it might be an invalid '#'.
2601      *
2602      * Given that the former is vastly more likely than the latter in each case (most users are familiar with
2603      * the magic status of '%' and '#' and they occur relatively infrequently in filenames, and if the user parses
2604      * a proper Java File, we will already have %-escaped the URI), we simply assume that %'s and #'s are legit.
2605      *
2606      * Very rarely, we may be wrong. If so, tell the user to fix the clearly broken URI.
2607      */
2608     protected static String escapeNonUSAscii(String str) {
2609         if (str == null) {
2610             return str;
2611         }












2612 
2613         // get UTF-8 bytes for the string
2614         StringBuffer buffer = new StringBuffer();
2615         byte[] bytes = null;
2616         byte b;
2617         try {
2618             bytes = str.getBytes("UTF-8");
2619         } catch (java.io.UnsupportedEncodingException e) {
2620             // should never happen
2621             return str;
2622         }
2623         int len = bytes.length;
2624         int ch;
2625 
2626         // for each byte
2627         for (int i = 0; i < len; i++) {
2628             b = bytes[i];
2629             // for non-ascii character: make it positive, then escape
2630             if (b < 0) {
2631                 ch = b + 256;
2632                 buffer.append('%');
2633                 buffer.append(gHexChs[ch >> 4]);
2634                 buffer.append(gHexChs[ch & 0xf]);
2635             }
2636             else if (b != '%' && b != '#' && gNeedEscaping[b]) {
2637                 buffer.append('%');
2638                 buffer.append(gAfterEscaping1[b]);
2639                 buffer.append(gAfterEscaping2[b]);
2640             }
2641             else {
2642                 buffer.append((char)b);
2643             }
2644         }
2645         return buffer.toString();
2646     }
2647 




2592      * Passed a URI that contains invalid characters (like spaces, non-ASCII Unicode characters, and the like),
2593      * this function percent encodes the invalid characters per the URI specification (i.e., as a sequence of
2594      * %-encoded UTF-8 octets).
2595      *
2596      * N.B. There are two problems. If the URI contains a '%' character, that might be an indication that
2597      * the URI has already been escaped by the author, or it might be an invalid '%'. In the former case,
2598      * it's important not to escape it, or we'll wind up with invalid, doubly-escaped '%'s. In the latter,
2599      * the URI is broken if we don't encode it. Similarly, a '#' character might be the start of a fragment
2600      * identifier or it might be an invalid '#'.
2601      *
2602      * Given that the former is vastly more likely than the latter in each case (most users are familiar with
2603      * the magic status of '%' and '#' and they occur relatively infrequently in filenames, and if the user parses
2604      * a proper Java File, we will already have %-escaped the URI), we simply assume that %'s and #'s are legit.
2605      *
2606      * Very rarely, we may be wrong. If so, tell the user to fix the clearly broken URI.
2607      */
protected static String escapeNonUSAscii(String str) {
    if (str == null) {
        return str;
    }

    // Fast path: find the first character outside 7-bit ASCII. If there is
    // none, the string is returned exactly as given. Note that in this case
    // nothing at all is percent-encoded, including any ASCII characters that
    // the table-driven branch below would have escaped.
    final int charCount = str.length();
    int firstNonAscii = 0;
    while (firstNonAscii < charCount && str.charAt(firstNonAscii) < 128) {
        firstNonAscii++;
    }
    if (firstNonAscii == charCount) {
        return str;
    }

    // At least one non-ASCII character is present: escape over the UTF-8 octets.
    byte[] bytes;
    try {
        bytes = str.getBytes("UTF-8");
    } catch (java.io.UnsupportedEncodingException e) {
        // should never happen
        return str;
    }

    // The accumulator never leaves this method, so the unsynchronized
    // StringBuilder is sufficient. The output has at least one char per byte.
    StringBuilder buffer = new StringBuilder(bytes.length);
    for (int i = 0; i < bytes.length; i++) {
        byte b = bytes[i];
        if (b < 0) {
            // non-ASCII octet: make it positive, then emit it as %XX
            int ch = b + 256;
            buffer.append('%');
            buffer.append(gHexChs[ch >> 4]);
            buffer.append(gHexChs[ch & 0xf]);
        } else if (b != '%' && b != '#' && gNeedEscaping[b]) {
            // ASCII octet flagged by the lookup table; '%' and '#' are
            // deliberately exempt because they are assumed to be legitimate
            buffer.append('%');
            buffer.append(gAfterEscaping1[b]);
            buffer.append(gAfterEscaping2[b]);
        } else {
            buffer.append((char) b);
        }
    }
    return buffer.toString();
}
2659