2592 * Passed a URI that contains invalid characters (like spaces, non-ASCII Unicode characters, and the like), 2593 * this function percent encodes the invalid characters per the URI specification (i.e., as a sequence of 2594 * %-encoded UTF-8 octets). 2595 * 2596 * N.B. There are two problems. If the URI contains a '%' character, that might be an indication that 2597 * the URI has already been escaped by the author, or it might be an invalid '%'. In the former case, 2598 * it's important not to escape it, or we'll wind up with invalid, doubly-escaped '%'s. In the latter, 2599 * the URI is broken if we don't encode it. Similarly, a '#' character might be the start of a fragment 2600 * identifier or it might be an invalid '#'. 2601 * 2602 * Given that the former is vastly more likely than the latter in each case (most users are familiar with 2603 * the magic status of '%' and '#' and they occur relatively infrequently in filenames, and if the user parses 2604 * a proper Java File, we will already have %-escaped the URI), we simply assume that %'s and #'s are legit. 2605 * 2606 * Very rarely, we may be wrong. If so, tell the user to fix the clearly broken URI. 2607 */ 2608 protected static String escapeNonUSAscii(String str) { 2609 if (str == null) { 2610 return str; 2611 } 2612 2613 // get UTF-8 bytes for the string 2614 StringBuffer buffer = new StringBuffer(); 2615 byte[] bytes = null; 2616 byte b; 2617 try { 2618 bytes = str.getBytes("UTF-8"); 2619 } catch (java.io.UnsupportedEncodingException e) { 2620 // should never happen 2621 return str; 2622 } 2623 int len = bytes.length; 2624 int ch; 2625 2626 // for each byte 2627 for (int i = 0; i < len; i++) { 2628 b = bytes[i]; 2629 // for non-ascii character: make it positive, then escape 2630 if (b < 0) { 2631 ch = b + 256; 2632 buffer.append('%'); 2633 buffer.append(gHexChs[ch >> 4]); 2634 buffer.append(gHexChs[ch & 0xf]); 2635 } 2636 else if (b != '%' && b != '#' && gNeedEscaping[b]) { 2637 buffer.append('%'); 2638 buffer.append(gAfterEscaping1[b]); 2639 buffer.append(gAfterEscaping2[b]); 2640 } 2641 else { 2642 buffer.append((char)b); 2643 } 2644 } 2645 return buffer.toString(); 2646 } 2647 | 2592 * Passed a URI that contains invalid characters (like spaces, non-ASCII Unicode characters, and the like), 2593 * this function percent encodes the invalid characters per the URI specification (i.e., as a sequence of 2594 * %-encoded UTF-8 octets). 2595 * 2596 * N.B. There are two problems. If the URI contains a '%' character, that might be an indication that 2597 * the URI has already been escaped by the author, or it might be an invalid '%'. In the former case, 2598 * it's important not to escape it, or we'll wind up with invalid, doubly-escaped '%'s. In the latter, 2599 * the URI is broken if we don't encode it. Similarly, a '#' character might be the start of a fragment 2600 * identifier or it might be an invalid '#'. 2601 * 2602 * Given that the former is vastly more likely than the latter in each case (most users are familiar with 2603 * the magic status of '%' and '#' and they occur relatively infrequently in filenames, and if the user parses 2604 * a proper Java File, we will already have %-escaped the URI), we simply assume that %'s and #'s are legit. 2605 * 2606 * Very rarely, we may be wrong. If so, tell the user to fix the clearly broken URI. 2607 */ 2608 protected static String escapeNonUSAscii(String str) { 2609 if (str == null) { 2610 return str; 2611 } 2612 int len = str.length(), i=0, ch; 2613 for (; i < len; i++) { 2614 ch = str.charAt(i); 2615 // if it's not an ASCII 7 character, break here, and use UTF-8 encoding 2616 if (ch >= 128) 2617 break; 2618 } 2619 2620 // we saw no non-ascii-7 character 2621 if (i == len) { 2622 return str; 2623 } 2624 2625 // get UTF-8 bytes for the string 2626 StringBuffer buffer = new StringBuffer(); 2627 byte[] bytes = null; 2628 byte b; 2629 try { 2630 bytes = str.getBytes("UTF-8"); 2631 } catch (java.io.UnsupportedEncodingException e) { 2632 // should never happen 2633 return str; 2634 } 2635 2636 len = bytes.length; 2637 2638 // for each byte 2639 for (i = 0; i < len; i++) { 2640 b = bytes[i]; 2641 // for non-ascii character: make it positive, then escape 2642 if (b < 0) { 2643 ch = b + 256; 2644 buffer.append('%'); 2645 buffer.append(gHexChs[ch >> 4]); 2646 buffer.append(gHexChs[ch & 0xf]); 2647 } 2648 else if (b != '%' && b != '#' && gNeedEscaping[b]) { 2649 buffer.append('%'); 2650 buffer.append(gAfterEscaping1[b]); 2651 buffer.append(gAfterEscaping2[b]); 2652 } 2653 else { 2654 buffer.append((char)b); 2655 } 2656 } 2657 return buffer.toString(); 2658 } 2659 |