< prev index next >
src/java.base/share/native/libjava/jni_util.c
Print this page
rev 17266 : 8181147: JNI_GetStringPlatformChars should have a fast path for UTF-8
Reviewed-by: shade, chegar, erikj
@@ -442,13 +442,12 @@
return obj;
}
/* Optimized for char set ISO_8859_1 */
static jstring
-newString8859_1(JNIEnv *env, const char *str)
+newSizedString8859_1(JNIEnv *env, const char *str, int len)
{
- int len = (int)strlen(str);
jchar buf[512];
jchar *str1;
jstring result;
int i;
@@ -467,10 +466,17 @@
if (str1 != buf)
free(str1);
return result;
}
+/*
+ * Convenience wrapper for NUL-terminated input: measures str with strlen()
+ * and hands the bytes plus their count to newSizedString8859_1().
+ */
+static jstring
+newString8859_1(JNIEnv *env, const char *str)
+{
+    return newSizedString8859_1(env, str, (int)strlen(str));
+}
+
static const char*
getString8859_1Chars(JNIEnv *env, jstring jstr)
{
int i;
char *result;
@@ -498,10 +504,128 @@
result[len] = 0;
(*env)->ReleaseStringCritical(env, jstr, str);
return result;
}
+/* Real UTF-8, adapted from java.lang.StringCoding.encodeUTF8 */
+/* UTF-16 code unit range of high (leading) surrogates */
+#define MIN_HIGH_SURROGATE 0xD800
+#define MAX_HIGH_SURROGATE 0xDBFF
+/* UTF-16 code unit range of low (trailing) surrogates */
+#define MIN_LOW_SURROGATE 0xDC00
+#define MAX_LOW_SURROGATE 0xDFFF
+/* Combined surrogate range: first high surrogate .. last low surrogate */
+#define MAX_SURROGATE MAX_LOW_SURROGATE
+#define MIN_SURROGATE MIN_HIGH_SURROGATE
+/* First code point that needs a surrogate pair in UTF-16 */
+#define MIN_SUPPLEMENTARY_CODE_POINT 0x010000
+
+/* Nonzero when unicode lies in [MIN_SURROGATE, MAX_SURROGATE]. */
+static jboolean isSurrogate(jchar unicode) {
+    const int atLeastMin = (unicode >= MIN_SURROGATE);
+    const int atMostMax = (unicode <= MAX_SURROGATE);
+    return (jboolean)(atLeastMin && atMostMax);
+}
+
+/* Nonzero when unicode lies in [MIN_LOW_SURROGATE, MAX_LOW_SURROGATE]. */
+static jboolean isLowSurrogate(jchar unicode) {
+    const int atLeastMin = (unicode >= MIN_LOW_SURROGATE);
+    const int atMostMax = (unicode <= MAX_LOW_SURROGATE);
+    return (jboolean)(atLeastMin && atMostMax);
+}
+
+/* Nonzero when unicode lies in [MIN_HIGH_SURROGATE, MAX_HIGH_SURROGATE]. */
+static jboolean isHighSurrogate(jchar unicode) {
+    const int atLeastMin = (unicode >= MIN_HIGH_SURROGATE);
+    const int atMostMax = (unicode <= MAX_HIGH_SURROGATE);
+    return (jboolean)(atLeastMin && atMostMax);
+}
+/*
+ * Combines a surrogate pair into one code point.
+ *
+ * Arithmetically equal to
+ *     ((high - MIN_HIGH_SURROGATE) << 10)
+ *         + (low - MIN_LOW_SURROGATE)
+ *         + MIN_SUPPLEMENTARY_CODE_POINT
+ * but with the three constants folded into a single bias that is added
+ * after shifting, so the per-call work is one shift and two additions.
+ */
+static int toCodePoint(jchar high, jchar low) {
+    const int bias = MIN_SUPPLEMENTARY_CODE_POINT
+                     - (MIN_HIGH_SURROGATE << 10)
+                     - MIN_LOW_SURROGATE;
+    const int shifted = (high << 10) + low;
+
+    return shifted + bias;
+}
+
+/*
+ * Encodes the UTF-16 contents of jstr as UTF-8 into a newly allocated,
+ * NUL-terminated buffer (allocated with MALLOC_MIN4).
+ *
+ * Encoding rules:
+ *   U+0000..U+007F    1 byte
+ *   U+0080..U+07FF    2 bytes
+ *   U+0800..U+FFFF    3 bytes (non-surrogates)
+ *   surrogate pair    4 bytes (the combined supplementary code point)
+ *   lone surrogate    the single byte '?'
+ *
+ * Returns NULL if GetStringChars fails, or if the buffer cannot be
+ * allocated (in which case an OutOfMemoryError is thrown through
+ * JNU_ThrowOutOfMemoryError). The string chars are released on every
+ * path that acquired them.
+ *
+ * NOTE(review): the byte count is accumulated in a jsize; presumably
+ * strings long enough to overflow it are not expected here - confirm.
+ */
+static const char*
+getStringUTF8(JNIEnv *env, jstring jstr)
+{
+    jsize i;
+    int dp = 0;
+    char *result;
+    int asciiOnly = 1;
+    jsize len = (*env)->GetStringLength(env, jstr);
+    jsize size = 0;
+
+    const jchar *str = (*env)->GetStringChars(env, jstr, NULL);
+    if (str == NULL) {
+        return NULL;
+    }
+
+    /* Pass 1: count the encoded bytes and note whether all input is ASCII */
+    for (i = 0; i < len;) {
+        jchar c = str[i++];
+        if (c < 0x0080) {
+            size++;
+            continue;
+        }
+        asciiOnly = 0;
+        if (c < 0x0800) {
+            size += 2;
+        } else if (isSurrogate(c)) {
+            if (isHighSurrogate(c) && i < len &&
+                isLowSurrogate(str[i])) {
+                size += 4;
+                i++; /* consumed both halves of the pair */
+            } else {
+                size++; /* lone surrogate becomes '?' */
+            }
+        } else {
+            /* 3 bytes, 16 bits */
+            size += 3;
+        }
+    }
+
+    result = MALLOC_MIN4(size);
+
+    if (result == NULL) {
+        (*env)->ReleaseStringChars(env, jstr, str);
+        JNU_ThrowOutOfMemoryError(env, NULL);
+        return NULL;
+    }
+
+    if (asciiOnly) {
+        /*
+         * ASCII fast path. Keyed on an explicit flag rather than on
+         * size == len: a lone surrogate also contributes exactly one
+         * byte, so the size comparison alone cannot prove the input
+         * is ASCII.
+         */
+        for (i = 0; i < len; i++) {
+            result[dp++] = (char)str[i];
+        }
+    } else {
+        /* Pass 2: emit bytes using the same classification as pass 1 */
+        for (i = 0; i < len;) {
+            jchar c = str[i++];
+            if (c < 0x0080) {
+                result[dp++] = (char)c;
+            } else if (c < 0x0800) {
+                /* 2 bytes, 11 bits */
+                result[dp++] = (char)(0xc0 | (c >> 6));
+                result[dp++] = (char)(0x80 | (c & 0x3f));
+            } else if (isSurrogate(c)) {
+                int uc = -1;
+                if (isHighSurrogate(c) && i < len &&
+                    isLowSurrogate(str[i])) {
+                    uc = toCodePoint(c, str[i]);
+                }
+                if (uc < 0) {
+                    result[dp++] = '?';
+                } else {
+                    /* 4 bytes, 21 bits */
+                    result[dp++] = (char)(0xf0 | ((uc >> 18)));
+                    result[dp++] = (char)(0x80 | ((uc >> 12) & 0x3f));
+                    result[dp++] = (char)(0x80 | ((uc >> 6) & 0x3f));
+                    result[dp++] = (char)(0x80 | (uc & 0x3f));
+                    i++; /* 2 chars */
+                }
+            } else {
+                /* 3 bytes, 16 bits */
+                result[dp++] = (char)(0xe0 | ((c >> 12)));
+                result[dp++] = (char)(0x80 | ((c >> 6) & 0x3f));
+                result[dp++] = (char)(0x80 | (c & 0x3f));
+            }
+        }
+    }
+
+    result[size] = 0;
+    (*env)->ReleaseStringChars(env, jstr, str);
+    return result;
+}
+
/* Optimized for char set ISO646-US (us-ascii) */
static jstring
newString646_US(JNIEnv *env, const char *str)
{
@@ -667,16 +791,46 @@
/* Selected fast path; remains NO_ENCODING_YET until initializeEncoding() chooses one */
static int fastEncoding = NO_ENCODING_YET;
/* Global ref to the encoding name; initializeEncoding() sets it for FAST_UTF_8 and NO_FAST_ENCODING */
static jstring jnuEncoding = NULL;
/* Cached method IDs */
static jmethodID String_init_ID; /* String(byte[], enc) */
+static jmethodID String_init_coder_ID; /* String(byte[], byte) */
static jmethodID String_getBytes_ID; /* String.getBytes(enc) */
-int getFastEncoding() {
- return fastEncoding;
+/*
+ * Optimized for char set UTF-8.
+ *
+ * Scans the NUL-terminated str once to find its length and to see whether
+ * any byte has the high bit set. Pure-ASCII input goes straight to
+ * newSizedString8859_1(). Anything else is copied into a byte[] and decoded
+ * by the String(byte[], enc) constructor using jnuEncoding.
+ *
+ * Returns NULL when local-reference capacity cannot be ensured, when
+ * JNU_ClassString() fails, or when the byte array cannot be created;
+ * otherwise returns whatever the chosen construction path yields.
+ */
+static jstring
+newStringUTF8(JNIEnv *env, const char *str)
+{
+    jboolean isAscii = JNI_TRUE;
+    jstring result;
+    jbyteArray hab = NULL;
+    jclass strClazz;
+    int len = 0;
+    char b;
+    for (b = str[len]; b != '\0'; len++, b = str[len]) {
+        if (isAscii && (b & 0x80)) {
+            isAscii = JNI_FALSE;
+        }
+    }
+
+    if (isAscii) {
+        return newSizedString8859_1(env, str, len);
+    }
+
+    /* Two local refs are created below: the byte[] and the new String */
+    if ((*env)->EnsureLocalCapacity(env, 2) < 0) {
+        return NULL;
+    }
+
+    /*
+     * Look the class up before allocating the array, so an early return
+     * here cannot leave the array's local reference behind.
+     */
+    strClazz = JNU_ClassString(env);
+    CHECK_NULL_RETURN(strClazz, NULL);
+
+    hab = (*env)->NewByteArray(env, len);
+    if (hab == NULL) {
+        return NULL;
+    }
+
+    (*env)->SetByteArrayRegion(env, hab, 0, len, (jbyte *)str);
+    result = (*env)->NewObject(env, strClazz,
+                               String_init_ID, hab, jnuEncoding);
+    (*env)->DeleteLocalRef(env, hab);
+    return result;
}
+
/* Initialize the fast encoding. If the "sun.jnu.encoding" property
* has not yet been set, we leave fastEncoding == NO_ENCODING_YET.
*/
void
initializeEncoding(JNIEnv *env)
@@ -716,21 +870,24 @@
* "en_UK" locale -> "ISO8859-1" (on 2.6)
*/
if ((strcmp(encname, "8859_1") == 0) ||
(strcmp(encname, "ISO8859-1") == 0) ||
(strcmp(encname, "ISO8859_1") == 0) ||
- (strcmp(encname, "ISO-8859-1") == 0))
+ (strcmp(encname, "ISO-8859-1") == 0)) {
fastEncoding = FAST_8859_1;
- else if (strcmp(encname, "ISO646-US") == 0)
+ } else if (strcmp(encname, "UTF-8") == 0) {
+ fastEncoding = FAST_UTF_8;
+ jnuEncoding = (jstring)(*env)->NewGlobalRef(env, enc);
+ } else if (strcmp(encname, "ISO646-US") == 0) {
fastEncoding = FAST_646_US;
- else if (strcmp(encname, "Cp1252") == 0 ||
+ } else if (strcmp(encname, "Cp1252") == 0 ||
/* This is a temporary fix until we move */
/* to wide character versions of all Windows */
/* calls. */
- strcmp(encname, "utf-16le") == 0)
+ strcmp(encname, "utf-16le") == 0) {
fastEncoding = FAST_CP1252;
- else {
+ } else {
fastEncoding = NO_FAST_ENCODING;
jnuEncoding = (jstring)(*env)->NewGlobalRef(env, enc);
}
(*env)->ReleaseStringUTFChars(env, enc, encname);
}
@@ -748,10 +905,12 @@
String_getBytes_ID = (*env)->GetMethodID(env, strClazz,
"getBytes", "(Ljava/lang/String;)[B");
CHECK_NULL(String_getBytes_ID);
String_init_ID = (*env)->GetMethodID(env, strClazz,
"<init>", "([BLjava/lang/String;)V");
+ String_init_coder_ID = (*env)->GetMethodID(env, strClazz,
+ "<init>", "([BB)V");
}
static jboolean isJNUEncodingSupported = JNI_FALSE;
static jboolean jnuEncodingSupported(JNIEnv *env) {
jboolean exe;
@@ -790,10 +949,12 @@
return newString8859_1(env, str);
if (fastEncoding == FAST_646_US)
return newString646_US(env, str);
if (fastEncoding == FAST_CP1252)
return newStringCp1252(env, str);
+ if (fastEncoding == FAST_UTF_8)
+ return newStringUTF8(env, str);
if ((*env)->EnsureLocalCapacity(env, 2) < 0)
return NULL;
len = (int)strlen(str);
@@ -848,10 +1009,12 @@
return getString8859_1Chars(env, jstr);
if (fastEncoding == FAST_646_US)
return getString646_USChars(env, jstr);
if (fastEncoding == FAST_CP1252)
return getStringCp1252Chars(env, jstr);
+ if (fastEncoding == FAST_UTF_8)
+ return getStringUTF8(env, jstr);
if ((*env)->EnsureLocalCapacity(env, 2) < 0)
return 0;
if (jnuEncodingSupported(env)) {
< prev index next >