< prev index next >

src/java.base/share/native/libjava/jni_util.c

Print this page
rev 17266 : 8181147: JNI_GetStringPlatformChars should have a fast path for UTF-8
Reviewed-by: shade, chegar, erikj

@@ -442,13 +442,12 @@
     return obj;
 }
 
 /* Optimized for char set ISO_8859_1 */
 static jstring
-newString8859_1(JNIEnv *env, const char *str)
+newSizedString8859_1(JNIEnv *env, const char *str, int len)
 {
-    int len = (int)strlen(str);
     jchar buf[512];
     jchar *str1;
     jstring result;
     int i;
 

@@ -467,10 +466,17 @@
     if (str1 != buf)
         free(str1);
     return result;
 }
 
+/*
+ * Convenience wrapper for NUL-terminated input: measures str with strlen
+ * and hands the work to newSizedString8859_1.
+ */
+static jstring
+newString8859_1(JNIEnv *env, const char *str)
+{
+    return newSizedString8859_1(env, str, (int)strlen(str));
+}
+
 static const char*
 getString8859_1Chars(JNIEnv *env, jstring jstr)
 {
     int i;
     char *result;

@@ -498,10 +504,128 @@
     result[len] = 0;
     (*env)->ReleaseStringCritical(env, jstr, str);
     return result;
 }
 
+/*
+ * Real UTF-8, adapted from java.lang.StringCoding.encodeUTF8.
+ *
+ * The constants below give the inclusive bounds of the UTF-16 surrogate
+ * ranges and the first code point that needs a surrogate pair.
+ */
+/* High (leading) surrogate range: 0xD800..0xDBFF */
+#define MIN_HIGH_SURROGATE 0xD800
+#define MAX_HIGH_SURROGATE 0xDBFF
+/* Low (trailing) surrogate range: 0xDC00..0xDFFF */
+#define MIN_LOW_SURROGATE 0xDC00
+#define MAX_LOW_SURROGATE 0xDFFF
+/* Combined surrogate range, spanning both of the ranges above */
+#define MAX_SURROGATE MAX_LOW_SURROGATE
+#define MIN_SURROGATE MIN_HIGH_SURROGATE
+/* Smallest code point that does not fit in a single 16-bit unit */
+#define MIN_SUPPLEMENTARY_CODE_POINT 0x010000
+
+/* True when unicode lies in the inclusive range MIN_SURROGATE..MAX_SURROGATE. */
+static jboolean isSurrogate(jchar unicode) {
+    if (unicode < MIN_SURROGATE) {
+        return JNI_FALSE;
+    }
+    return (unicode <= MAX_SURROGATE) ? JNI_TRUE : JNI_FALSE;
+}
+
+/* True when unicode lies in the inclusive range MIN_LOW_SURROGATE..MAX_LOW_SURROGATE. */
+static jboolean isLowSurrogate(jchar unicode) {
+    if (unicode < MIN_LOW_SURROGATE) {
+        return JNI_FALSE;
+    }
+    return (unicode <= MAX_LOW_SURROGATE) ? JNI_TRUE : JNI_FALSE;
+}
+
+/* True when unicode lies in the inclusive range MIN_HIGH_SURROGATE..MAX_HIGH_SURROGATE. */
+static jboolean isHighSurrogate(jchar unicode) {
+    if (unicode < MIN_HIGH_SURROGATE) {
+        return JNI_FALSE;
+    }
+    return (unicode <= MAX_HIGH_SURROGATE) ? JNI_TRUE : JNI_FALSE;
+}
+/*
+ * Combines a high/low surrogate pair into a single code point.
+ *
+ * Arithmetically equal to
+ *     ((high - MIN_HIGH_SURROGATE) << 10)
+ *         + (low - MIN_LOW_SURROGATE)
+ *         + MIN_SUPPLEMENTARY_CODE_POINT
+ * with all the constant terms folded into one bias value.
+ */
+static int toCodePoint(jchar high, jchar low) {
+    const int bias = MIN_SUPPLEMENTARY_CODE_POINT
+                     - (MIN_HIGH_SURROGATE << 10)
+                     - MIN_LOW_SURROGATE;
+    int combined = (high << 10) + low;
+    return combined + bias;
+}
+
+/*
+ * Encodes a Java string as a freshly allocated, NUL-terminated UTF-8 C string.
+ *
+ * Works in two passes over the UTF-16 chars of jstr: the first computes the
+ * encoded size, the second writes the bytes.  Encoded sizes per UTF-16 unit:
+ *   U+0000..U+007F            1 byte
+ *   U+0080..U+07FF            2 bytes
+ *   valid surrogate pair      4 bytes (consumes two chars)
+ *   unpaired surrogate        1 byte  ('?' is substituted)
+ *   anything else             3 bytes
+ *
+ * Returns NULL if the string chars cannot be obtained, or if allocation
+ * fails (in which case an OutOfMemoryError is thrown).  The buffer is
+ * allocated with MALLOC_MIN4.
+ *
+ * NOTE(review): size is accumulated in a jsize with no overflow check;
+ * confirm whether extremely long strings need a guard here.
+ */
+static const char*
+getStringUTF8(JNIEnv *env, jstring jstr)
+{
+    jsize i;
+    int dp = 0;
+    char *result;
+    jboolean isAscii = JNI_TRUE;
+    jsize len = (*env)->GetStringLength(env, jstr);
+    jsize size = 0;
+
+    const jchar *str = (*env)->GetStringChars(env, jstr, NULL);
+    if (str == NULL) {
+        return NULL;
+    }
+
+    /* Pass 1: compute the encoded length and detect pure-ASCII input. */
+    for (i = 0; i < len;) {
+        jchar c = str[i++];
+        if (c < 0x0080) {
+            size++;
+        } else {
+            /*
+             * Tracked explicitly: an unpaired surrogate also counts as one
+             * byte, so "size == len" alone does not imply ASCII.
+             */
+            isAscii = JNI_FALSE;
+            if (c < 0x0800) {
+                size += 2;
+            } else if (isSurrogate(c)) {
+                if (isHighSurrogate(c) && i < len &&
+                    isLowSurrogate(str[i])) {
+                    size += 4;
+                    i++;  // 2 chars
+                } else {
+                    size++;  // replaced by '?'
+                }
+            } else {
+                // 3 bytes, 16 bits
+                size += 3;
+            }
+        }
+    }
+
+    result = MALLOC_MIN4(size);
+
+    if (result == NULL) {
+        (*env)->ReleaseStringChars(env, jstr, str);
+        JNU_ThrowOutOfMemoryError(env, NULL);
+        return NULL;
+    }
+
+    /* Pass 2: write the encoded bytes. */
+    if (isAscii) {
+        // ASCII fast path: every char maps to exactly one byte
+        for (i = 0; i < len;) {
+            result[dp++] = (char)str[i++];
+        }
+    } else {
+        for (i = 0; i < len;) {
+            jchar c = str[i++];
+            if (c < 0x0080) {
+                result[dp++] = (char)c;
+            } else if (c < 0x0800) {
+                // 2 bytes, 11 bits
+                result[dp++] = (char)(0xc0 | (c >> 6));
+                result[dp++] = (char)(0x80 | (c & 0x3f));
+            } else if (isSurrogate(c)) {
+                int uc = -1;
+                if (isHighSurrogate(c) && i < len &&
+                    isLowSurrogate(str[i])) {
+                    uc = toCodePoint(c, str[i]);
+                }
+                if (uc < 0) {
+                    // unpaired surrogate cannot be encoded
+                    result[dp++] = '?';
+                } else {
+                    // 4 bytes, 21 bits
+                    result[dp++] = (char)(0xf0 | ((uc >> 18)));
+                    result[dp++] = (char)(0x80 | ((uc >> 12) & 0x3f));
+                    result[dp++] = (char)(0x80 | ((uc >>  6) & 0x3f));
+                    result[dp++] = (char)(0x80 | (uc & 0x3f));
+                    i++;  // 2 chars
+                }
+            } else {
+                // 3 bytes, 16 bits
+                result[dp++] = (char)(0xe0 | ((c >> 12)));
+                result[dp++] = (char)(0x80 | ((c >>  6) & 0x3f));
+                result[dp++] = (char)(0x80 | (c & 0x3f));
+            }
+        }
+    }
+
+    result[size] = 0;
+    (*env)->ReleaseStringChars(env, jstr, str);
+    return result;
+}
+
 
 /* Optimized for char set ISO646-US (us-ascii) */
 static jstring
 newString646_US(JNIEnv *env, const char *str)
 {

@@ -667,16 +791,46 @@
 static int fastEncoding = NO_ENCODING_YET;
 static jstring jnuEncoding = NULL;
 
 /* Cached method IDs */
 static jmethodID String_init_ID;        /* String(byte[], enc) */
+static jmethodID String_init_coder_ID;  /* String(byte[], byte) */
 static jmethodID String_getBytes_ID;    /* String.getBytes(enc) */
 
-int getFastEncoding() {
-    return fastEncoding;
+/*
+ * Optimized for char set UTF-8.
+ *
+ * Scans the NUL-terminated input once to find its length and whether any
+ * byte has the high bit set.  Pure-ASCII input is handed to
+ * newSizedString8859_1.  Otherwise the bytes are copied into a byte[] and
+ * passed to the String(byte[], enc) constructor together with jnuEncoding.
+ *
+ * Returns NULL if the byte array cannot be allocated or the String class
+ * cannot be resolved; the temporary array reference is released on every
+ * path that created it.
+ */
+static jstring
+newStringUTF8(JNIEnv *env, const char *str)
+{
+    jboolean isAscii = JNI_TRUE;
+    jstring result;
+    jbyteArray hab = NULL;
+    jclass strClazz;
+    int len = 0;
+    char b;
+    for (b = str[len]; b != '\0'; len++, b = str[len]) {
+        if (isAscii && (b & 0x80) != 0) {
+            isAscii = JNI_FALSE;
+        }
+    }
+
+    if (isAscii) {
+        return newSizedString8859_1(env, str, len);
+    }
+
+    hab = (*env)->NewByteArray(env, len);
+    if (hab == NULL) {
+        return NULL;
+    }
+
+    strClazz = JNU_ClassString(env);
+    if (strClazz == NULL) {
+        /* Do not leave the array's local reference behind on failure. */
+        (*env)->DeleteLocalRef(env, hab);
+        return NULL;
+    }
+
+    (*env)->SetByteArrayRegion(env, hab, 0, len, (const jbyte *)str);
+    result = (*env)->NewObject(env, strClazz,
+                               String_init_ID, hab, jnuEncoding);
+    (*env)->DeleteLocalRef(env, hab);
+    return result;
 }
 
+
 /* Initialize the fast encoding.  If the "sun.jnu.encoding" property
  * has not yet been set, we leave fastEncoding == NO_ENCODING_YET.
  */
 void
 initializeEncoding(JNIEnv *env)

@@ -716,21 +870,24 @@
             *   "en_UK" locale -> "ISO8859-1"                   (on 2.6)
             */
                     if ((strcmp(encname, "8859_1") == 0) ||
                         (strcmp(encname, "ISO8859-1") == 0) ||
                         (strcmp(encname, "ISO8859_1") == 0) ||
-                        (strcmp(encname, "ISO-8859-1") == 0))
+                        (strcmp(encname, "ISO-8859-1") == 0)) {
                         fastEncoding = FAST_8859_1;
-                    else if (strcmp(encname, "ISO646-US") == 0)
+                    } else if (strcmp(encname, "UTF-8") == 0) {
+                        fastEncoding = FAST_UTF_8;
+                        jnuEncoding = (jstring)(*env)->NewGlobalRef(env, enc);
+                    } else if (strcmp(encname, "ISO646-US") == 0) {
                         fastEncoding = FAST_646_US;
-                    else if (strcmp(encname, "Cp1252") == 0 ||
+                    } else if (strcmp(encname, "Cp1252") == 0 ||
                              /* This is a temporary fix until we move */
                              /* to wide character versions of all Windows */
                              /* calls. */
-                             strcmp(encname, "utf-16le") == 0)
+                             strcmp(encname, "utf-16le") == 0) {
                         fastEncoding = FAST_CP1252;
-                    else {
+                    } else {
                         fastEncoding = NO_FAST_ENCODING;
                         jnuEncoding = (jstring)(*env)->NewGlobalRef(env, enc);
                     }
                     (*env)->ReleaseStringUTFChars(env, enc, encname);
                 }

@@ -748,10 +905,12 @@
     String_getBytes_ID = (*env)->GetMethodID(env, strClazz,
                                              "getBytes", "(Ljava/lang/String;)[B");
     CHECK_NULL(String_getBytes_ID);
     String_init_ID = (*env)->GetMethodID(env, strClazz,
                                          "<init>", "([BLjava/lang/String;)V");
+    String_init_coder_ID = (*env)->GetMethodID(env, strClazz,
+                                         "<init>", "([BB)V");
 }
 
 static jboolean isJNUEncodingSupported = JNI_FALSE;
 static jboolean jnuEncodingSupported(JNIEnv *env) {
     jboolean exe;

@@ -790,10 +949,12 @@
         return newString8859_1(env, str);
     if (fastEncoding == FAST_646_US)
         return newString646_US(env, str);
     if (fastEncoding == FAST_CP1252)
         return newStringCp1252(env, str);
+    if (fastEncoding == FAST_UTF_8)
+        return newStringUTF8(env, str);
 
     if ((*env)->EnsureLocalCapacity(env, 2) < 0)
         return NULL;
 
     len = (int)strlen(str);

@@ -848,10 +1009,12 @@
         return getString8859_1Chars(env, jstr);
     if (fastEncoding == FAST_646_US)
         return getString646_USChars(env, jstr);
     if (fastEncoding == FAST_CP1252)
         return getStringCp1252Chars(env, jstr);
+    if (fastEncoding == FAST_UTF_8)
+        return getStringUTF8(env, jstr);
 
     if ((*env)->EnsureLocalCapacity(env, 2) < 0)
         return 0;
 
     if (jnuEncodingSupported(env)) {
< prev index next >