< prev index next >

src/share/vm/utilities/utf8.cpp

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. --- 1,7 ---- /* ! * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation.
*** 25,35 **** #include "precompiled.hpp" #include "utilities/utf8.hpp" // Assume the utf8 string is in legal form and has been // checked in the class file parser/format checker. ! char* UTF8::next(const char* str, jchar* value) { unsigned const char *ptr = (const unsigned char *)str; unsigned char ch, ch2, ch3; int length = -1; /* bad length */ jchar result; switch ((ch = ptr[0]) >> 4) { --- 25,35 ---- #include "precompiled.hpp" #include "utilities/utf8.hpp" // Assume the utf8 string is in legal form and has been // checked in the class file parser/format checker. ! template<typename T> char* UTF8::next(const char* str, T* value) { unsigned const char *ptr = (const unsigned char *)str; unsigned char ch, ch2, ch3; int length = -1; /* bad length */ jchar result; switch ((ch = ptr[0]) >> 4) {
*** 66,80 **** } break; } /* end of switch */ if (length <= 0) { ! *value = ptr[0]; /* default bad result; */ return (char*)(ptr + 1); // make progress somehow } ! *value = result; // The assert is correct but the .class file is wrong // assert(UNICODE::utf8_size(result) == length, "checking reverse computation"); return (char *)(ptr + length); } --- 66,80 ---- } break; } /* end of switch */ if (length <= 0) { ! *value = (T)ptr[0]; /* default bad result; */ return (char*)(ptr + 1); // make progress somehow } ! *value = (T)result; // The assert is correct but the .class file is wrong // assert(UNICODE::utf8_size(result) == length, "checking reverse computation"); return (char *)(ptr + length); }
*** 94,128 **** } // Count bytes of the form 10xxxxxx and deduct this count // from the total byte count. The utf8 string must be in // legal form which has been verified in the format checker. ! int UTF8::unicode_length(const char* str, int len) { int num_chars = len; for (int i = 0; i < len; i++) { ! if ((str[i] & 0xC0) == 0x80) { --num_chars; } } return num_chars; } // Count bytes of the utf8 string except those in form // 10xxxxxx which only appear in multibyte characters. // The utf8 string must be in legal form and has been // verified in the format checker. ! int UTF8::unicode_length(const char* str) { int num_chars = 0; for (const char* p = str; *p; p++) { ! if (((*p) & 0xC0) != 0x80) { num_chars++; } } return num_chars; } ! // Writes a jchar a utf8 and returns the end static u_char* utf8_write(u_char* base, jchar ch) { if ((ch != 0) && (ch <=0x7f)) { base[0] = (u_char) ch; return base + 1; } --- 94,149 ---- } // Count bytes of the form 10xxxxxx and deduct this count // from the total byte count. The utf8 string must be in // legal form which has been verified in the format checker. ! int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) { int num_chars = len; + has_multibyte = false; + is_latin1 = true; + unsigned char prev = 0; for (int i = 0; i < len; i++) { ! unsigned char c = str[i]; ! if ((c & 0xC0) == 0x80) { ! // Multibyte, check if valid latin1 character. ! has_multibyte = true; ! if (prev > 0xC3) { ! is_latin1 = false; ! } --num_chars; } + prev = c; } return num_chars; } // Count bytes of the utf8 string except those in form // 10xxxxxx which only appear in multibyte characters. // The utf8 string must be in legal form and has been // verified in the format checker. ! int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) { int num_chars = 0; + has_multibyte = false; + is_latin1 = true; + unsigned char prev = 0; for (const char* p = str; *p; p++) { ! unsigned char c = (*p); ! if ((c & 0xC0) == 0x80) { ! // Multibyte, check if valid latin1 character. ! has_multibyte = true; ! if (prev > 0xC3) { ! is_latin1 = false; ! } ! } else { num_chars++; } + prev = c; } return num_chars; } ! // Writes a jchar as utf8 and returns the end static u_char* utf8_write(u_char* base, jchar ch) { if ((ch != 0) && (ch <=0x7f)) { base[0] = (u_char) ch; return base + 1; }
*** 143,169 **** base[1] = mid_six | 0x80; /* 10xxxxxx */ base[2] = low_six | 0x80; /* 10xxxxxx */ return base + 3; } ! void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) { unsigned char ch; const char *ptr = utf8_str; int index = 0; /* ASCII case loop optimization */ for (; index < unicode_length; index++) { if((ch = ptr[0]) > 0x7F) { break; } ! unicode_str[index] = ch; ptr = (const char *)(ptr + 1); } for (; index < unicode_length; index++) { ptr = UTF8::next(ptr, &unicode_str[index]); } } // returns the quoted ascii length of a 0-terminated utf8 string int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { const char *ptr = utf8_str; const char* end = ptr + utf8_length; int result = 0; --- 164,196 ---- base[1] = mid_six | 0x80; /* 10xxxxxx */ base[2] = low_six | 0x80; /* 10xxxxxx */ return base + 3; } ! template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) { unsigned char ch; const char *ptr = utf8_str; int index = 0; /* ASCII case loop optimization */ for (; index < unicode_length; index++) { if((ch = ptr[0]) > 0x7F) { break; } ! unicode_str[index] = (T)ch; ptr = (const char *)(ptr + 1); } for (; index < unicode_length; index++) { ptr = UTF8::next(ptr, &unicode_str[index]); } } + // Explicit instantiation for all supported string types. + template char* UTF8::next<jchar>(const char* str, jchar* value); + template char* UTF8::next<jbyte>(const char* str, jbyte* value); + template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length); + template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length); + // returns the quoted ascii length of a 0-terminated utf8 string int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { const char *ptr = utf8_str; const char* end = ptr + utf8_length; int result = 0;
*** 304,323 **** jint UTF8::get_supplementary_character(const unsigned char* str) { return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10) + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f); } - //------------------------------------------------------------------------------------- int UNICODE::utf8_size(jchar c) { if ((0x0001 <= c) && (c <= 0x007F)) return 1; if (c <= 0x07FF) return 2; return 3; } int UNICODE::utf8_length(jchar* base, int length) { int result = 0; for (int index = 0; index < length; index++) { jchar c = base[index]; if ((0x0001 <= c) && (c <= 0x007F)) result += 1; --- 331,366 ---- jint UTF8::get_supplementary_character(const unsigned char* str) { return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10) + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f); } //------------------------------------------------------------------------------------- + bool UNICODE::is_latin1(jchar c) { + return (c <= 0x00FF); + } + + bool UNICODE::is_latin1(jchar* base, int length) { + for (int index = 0; index < length; index++) { + if (base[index] > 0x00FF) { + return false; + } + } + return true; + } int UNICODE::utf8_size(jchar c) { if ((0x0001 <= c) && (c <= 0x007F)) return 1; if (c <= 0x07FF) return 2; return 3; } + int UNICODE::utf8_size(jbyte c) { + if (c >= 0x0001) return 1; + return 2; + } + int UNICODE::utf8_length(jchar* base, int length) { int result = 0; for (int index = 0; index < length; index++) { jchar c = base[index]; if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
*** 325,342 **** --- 368,414 ---- else result += 3; } return result; } + int UNICODE::utf8_length(jbyte* base, int length) { + int result = 0; + for (int index = 0; index < length; index++) { + jbyte c = base[index]; + result += utf8_size(c); + } + return result; + } + char* UNICODE::as_utf8(jchar* base, int length) { int utf8_len = utf8_length(base, length); u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); assert((int) strlen(result) == utf8_len, "length prediction must be correct"); return result; } + char* UNICODE::as_utf8(jbyte* base, int length) { + int utf8_len = utf8_length(base, length); + u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); + u_char* p = result; + if (utf8_len == length) { + for (int index = 0; index < length; index++) { + *p++ = base[index]; + } + } else { + // Unicode string contains U+0000 which should + // be encoded as 0xC080 in "modified" UTF8. + for (int index = 0; index < length; index++) { + p = utf8_write(p, ((jchar) base[index]) & 0xff); + } + } + *p = '\0'; + assert(p == &result[utf8_len], "length prediction must be correct"); + return (char*) result; + } + char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) { u_char* p = (u_char*)buf; for (int index = 0; index < length; index++) { jchar c = base[index]; buflen -= utf8_size(c);
*** 345,381 **** } *p = '\0'; return buf; } void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { for(int index = 0; index < length; index++) { utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); } *utf8_buffer = '\0'; } // returns the quoted ascii length of a unicode string ! int UNICODE::quoted_ascii_length(jchar* base, int length) { int result = 0; for (int i = 0; i < length; i++) { ! jchar c = base[i]; if (c >= 32 && c < 127) { result++; } else { result += 6; } } return result; } ! // converts a utf8 string to quoted ascii ! void UNICODE::as_quoted_ascii(const jchar* base, int length, char* buf, int buflen) { char* p = buf; char* end = buf + buflen; for (int index = 0; index < length; index++) { ! jchar c = base[index]; if (c >= 32 && c < 127) { if (p + 1 >= end) break; // string is truncated *p++ = (char)c; } else { if (p + 6 >= end) break; // string is truncated --- 417,475 ---- } *p = '\0'; return buf; } + char* UNICODE::as_utf8(jbyte* base, int length, char* buf, int buflen) { + u_char* p = (u_char*)buf; + u_char* end = (u_char*)buf + buflen; + for (int index = 0; index < length; index++) { + jbyte c = base[index]; + int sz = utf8_size(c); + buflen -= sz; + if (buflen <= 0) break; // string is truncated + if (sz == 1) { + *p++ = c; + } else { + // Unicode string contains U+0000 which should + // be encoded as 0xC080 in "modified" UTF8. + p = utf8_write(p, ((jchar) c) & 0xff); + } + } + *p = '\0'; + return buf; + } + void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { for(int index = 0; index < length; index++) { utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); } *utf8_buffer = '\0'; } // returns the quoted ascii length of a unicode string ! template<typename T> ! int UNICODE::quoted_ascii_length(T* base, int length) { int result = 0; for (int i = 0; i < length; i++) { ! T c = base[i]; if (c >= 32 && c < 127) { result++; } else { result += 6; } } return result; } ! // converts a unicode string to quoted ascii ! template<typename T> ! void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { char* p = buf; char* end = buf + buflen; for (int index = 0; index < length; index++) { ! T c = base[index]; if (c >= 32 && c < 127) { if (p + 1 >= end) break; // string is truncated *p++ = (char)c; } else { if (p + 6 >= end) break; // string is truncated
*** 384,393 **** --- 478,494 ---- } } *p = '\0'; } + // Explicit instantiation for all supported types. + template int UNICODE::quoted_ascii_length<jbyte>(jbyte* base, int length); + template int UNICODE::quoted_ascii_length<jchar>(jchar* base, int length); + template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen); + template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen); + + #ifndef PRODUCT void TestAsUtf8() { char res[60]; jchar str[20];
< prev index next >