384 c = (buffer[i] & 0xF) << 12; 385 i += 2; 386 if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) { 387 c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F); 388 if (version_leq_47 || c >= 0x800) { 389 break; 390 } 391 } 392 return false; 393 } // end of switch 394 } // end of for 395 return true; 396 } 397 398 //------------------------------------------------------------------------------------- 399 400 bool UNICODE::is_latin1(jchar c) { 401 return (c <= 0x00FF); 402 } 403 404 bool UNICODE::is_latin1(jchar* base, int length) { 405 for (int index = 0; index < length; index++) { 406 if (base[index] > 0x00FF) { 407 return false; 408 } 409 } 410 return true; 411 } 412 413 int UNICODE::utf8_size(jchar c) { 414 if ((0x0001 <= c) && (c <= 0x007F)) { 415 // ASCII character 416 return 1; 417 } else if (c <= 0x07FF) { 418 return 2; 419 } else { 420 return 3; 421 } 422 } 423 424 int UNICODE::utf8_size(jbyte c) { 425 if (c >= 0x01) { 426 // ASCII character. Check is equivalent to 427 // (0x01 <= c) && (c <= 0x7F) because c is signed. 428 return 1; 429 } else { 430 // Non-ASCII character or 0x00 which needs to be 431 // two-byte encoded as 0xC080 in modified UTF-8. 432 return 2; 433 } 434 } 435 436 template<typename T> 437 int UNICODE::utf8_length(T* base, int length) { 438 int result = 0; 439 for (int index = 0; index < length; index++) { 440 T c = base[index]; 441 result += utf8_size(c); 442 } 443 return result; 444 } 445 446 template<typename T> 447 char* UNICODE::as_utf8(T* base, int& length) { 448 int utf8_len = utf8_length(base, length); 449 u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); 450 char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); 451 assert((int) strlen(result) == utf8_len, "length prediction must be correct"); 452 // Set string length to uft8 length 453 length = utf8_len; 454 return (char*) result; 455 } 456 457 char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) { 458 u_char* p = (u_char*)buf; 459 for (int index = 0; index < length; index++) { 460 jchar c = base[index]; 461 buflen -= utf8_size(c); 462 if (buflen <= 0) break; // string is truncated 463 p = utf8_write(p, c); 464 } 465 *p = '\0'; 466 return buf; 467 } 468 469 char* UNICODE::as_utf8(jbyte* base, int length, char* buf, int buflen) { 470 u_char* p = (u_char*)buf; 471 u_char* end = (u_char*)buf + buflen; 472 for (int index = 0; index < length; index++) { 473 jbyte c = base[index]; 474 int sz = utf8_size(c); 475 buflen -= sz; 476 if (buflen <= 0) break; // string is truncated 477 if (sz == 1) { 478 // Copy ASCII characters (UTF-8 is ASCII compatible) 479 *p++ = c; 480 } else { 481 // Non-ASCII character or 0x00 which should 482 // be encoded as 0xC080 in "modified" UTF8. 483 p = utf8_write(p, ((jchar) c) & 0xff); 484 } 485 } 486 *p = '\0'; 487 return buf; 488 } 489 490 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { 491 for(int index = 0; index < length; index++) { 492 utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); 493 } 494 *utf8_buffer = '\0'; 495 } 496 497 // returns the quoted ascii length of a unicode string 498 template<typename T> 499 int UNICODE::quoted_ascii_length(T* base, int length) { 500 int result = 0; 501 for (int i = 0; i < length; i++) { 502 T c = base[i]; 503 if (c >= 32 && c < 127) { 504 result++; 505 } else { 506 result += 6; 507 } 508 } 509 return result; 510 } 511 512 // converts a unicode string to quoted ascii 513 template<typename T> 514 void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { 515 char* p = buf; 516 char* end = buf + buflen; 517 for (int index = 0; index < length; index++) { 518 T c = base[index]; 519 if (c >= 32 && c < 127) { 520 if (p + 1 >= end) break; // string is truncated 521 *p++ = (char)c; 522 } else { 523 if (p + 6 >= end) break; // string is truncated 524 sprintf(p, "\\u%04x", c); 525 p += 6; 526 } 527 } 528 *p = '\0'; 529 } 530 531 // Explicit instantiation for all supported types. 532 template int UNICODE::utf8_length(jbyte* base, int length); 533 template int UNICODE::utf8_length(jchar* base, int length); 534 template char* UNICODE::as_utf8(jbyte* base, int& length); 535 template char* UNICODE::as_utf8(jchar* base, int& length); 536 template int UNICODE::quoted_ascii_length<jbyte>(jbyte* base, int length); 537 template int UNICODE::quoted_ascii_length<jchar>(jchar* base, int length); 538 template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen); 539 template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen); | 384 c = (buffer[i] & 0xF) << 12; 385 i += 2; 386 if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) { 387 c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F); 388 if (version_leq_47 || c >= 0x800) { 389 break; 390 } 391 } 392 return false; 393 } // end of switch 394 } // end of for 395 return true; 396 } 397 398 //------------------------------------------------------------------------------------- 399 400 bool UNICODE::is_latin1(jchar c) { 401 return (c <= 0x00FF); 402 } 403 404 bool UNICODE::is_latin1(const jchar* base, int length) { 405 for (int index = 0; index < length; index++) { 406 if (base[index] > 0x00FF) { 407 return false; 408 } 409 } 410 return true; 411 } 412 413 int UNICODE::utf8_size(jchar c) { 414 if ((0x0001 <= c) && (c <= 0x007F)) { 415 // ASCII character 416 return 1; 417 } else if (c <= 0x07FF) { 418 return 2; 419 } else { 420 return 3; 421 } 422 } 423 424 int UNICODE::utf8_size(jbyte c) { 425 if (c >= 0x01) { 426 // ASCII character. Check is equivalent to 427 // (0x01 <= c) && (c <= 0x7F) because c is signed. 428 return 1; 429 } else { 430 // Non-ASCII character or 0x00 which needs to be 431 // two-byte encoded as 0xC080 in modified UTF-8. 432 return 2; 433 } 434 } 435 436 template<typename T> 437 int UNICODE::utf8_length(const T* base, int length) { 438 int result = 0; 439 for (int index = 0; index < length; index++) { 440 T c = base[index]; 441 result += utf8_size(c); 442 } 443 return result; 444 } 445 446 template<typename T> 447 char* UNICODE::as_utf8(const T* base, int& length) { 448 int utf8_len = utf8_length(base, length); 449 u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); 450 char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); 451 assert((int) strlen(result) == utf8_len, "length prediction must be correct"); 452 // Set string length to uft8 length 453 length = utf8_len; 454 return (char*) result; 455 } 456 457 char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) { 458 u_char* p = (u_char*)buf; 459 for (int index = 0; index < length; index++) { 460 jchar c = base[index]; 461 buflen -= utf8_size(c); 462 if (buflen <= 0) break; // string is truncated 463 p = utf8_write(p, c); 464 } 465 *p = '\0'; 466 return buf; 467 } 468 469 char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) { 470 u_char* p = (u_char*)buf; 471 u_char* end = (u_char*)buf + buflen; 472 for (int index = 0; index < length; index++) { 473 jbyte c = base[index]; 474 int sz = utf8_size(c); 475 buflen -= sz; 476 if (buflen <= 0) break; // string is truncated 477 if (sz == 1) { 478 // Copy ASCII characters (UTF-8 is ASCII compatible) 479 *p++ = c; 480 } else { 481 // Non-ASCII character or 0x00 which should 482 // be encoded as 0xC080 in "modified" UTF8. 483 p = utf8_write(p, ((jchar) c) & 0xff); 484 } 485 } 486 *p = '\0'; 487 return buf; 488 } 489 490 void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { 491 for(int index = 0; index < length; index++) { 492 utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); 493 } 494 *utf8_buffer = '\0'; 495 } 496 497 // returns the quoted ascii length of a unicode string 498 template<typename T> 499 int UNICODE::quoted_ascii_length(const T* base, int length) { 500 int result = 0; 501 for (int i = 0; i < length; i++) { 502 T c = base[i]; 503 if (c >= 32 && c < 127) { 504 result++; 505 } else { 506 result += 6; 507 } 508 } 509 return result; 510 } 511 512 // converts a unicode string to quoted ascii 513 template<typename T> 514 void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { 515 char* p = buf; 516 char* end = buf + buflen; 517 for (int index = 0; index < length; index++) { 518 T c = base[index]; 519 if (c >= 32 && c < 127) { 520 if (p + 1 >= end) break; // string is truncated 521 *p++ = (char)c; 522 } else { 523 if (p + 6 >= end) break; // string is truncated 524 sprintf(p, "\\u%04x", c); 525 p += 6; 526 } 527 } 528 *p = '\0'; 529 } 530 531 // Explicit instantiation for all supported types. 532 template int UNICODE::utf8_length(const jbyte* base, int length); 533 template int UNICODE::utf8_length(const jchar* base, int length); 534 template char* UNICODE::as_utf8(const jbyte* base, int& length); 535 template char* UNICODE::as_utf8(const jchar* base, int& length); 536 template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length); 537 template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length); 538 template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen); 539 template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen); |