1 /* 2 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "oops/arrayOop.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/intrinsicnode.hpp" 31 32 #ifdef PRODUCT 33 #define BLOCK_COMMENT(str) /* nothing */ 34 #define STOP(error) stop(error) 35 #else 36 #define BLOCK_COMMENT(str) block_comment(str) 37 #define STOP(error) block_comment(error); stop(error) 38 #endif 39 40 // Compress char[] to byte[] by compressing 16 bytes at once. Return 0 on failure. 
// Emits a fast VIS3 loop that compresses 16 bytes (8 chars) per iteration.
// Register contract (established by the code below):
//   cnt    - number of chars to compress; copied into 'result' up front.
//   result - holds the original count on success; set to 0 and control
//            branches to Ldone if a non-latin1 char (high byte set) is found.
//   Falls through to the local Lslow label when cnt < 8 or src/dst are not
//   8-byte aligned, with src/dst/cnt updated to reflect work already done —
//   the caller presumably emits a scalar tail loop (see string_compress)
//   right after this; TODO(review): confirm at the call site.
void C2_MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, Register result,
                                           Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                           FloatRegister ftmp1, FloatRegister ftmp2, FloatRegister ftmp3, Label& Ldone) {
  Label Lloop, Lslow;
  assert(UseVIS >= 3, "VIS3 is required"); // bshuffle/bmask/movxtod below are VIS instructions
  assert_different_registers(src, dst, cnt, tmp1, tmp2, tmp3, tmp4, result);
  assert_different_registers(ftmp1, ftmp2, ftmp3);

  // Check if cnt >= 8 (= 16 bytes)
  cmp(cnt, 8);
  br(Assembler::less, false, Assembler::pn, Lslow);
  delayed()->mov(cnt, result); // copy count (executed even when branching: not annulled)

  // Check for 8-byte alignment of src and dst
  or3(src, dst, tmp1);
  andcc(tmp1, 7, G0);
  br(Assembler::notZero, false, Assembler::pn, Lslow);
  delayed()->nop();

  // Set mask for bshuffle instruction: byte indices 1,3,5,7,9,b,d,f select
  // the low byte of each of the 8 chars in ftmp1/ftmp2.
  Register mask = tmp4;
  set(0x13579bdf, mask);
  bmask(mask, G0, G0);

  // Set mask to 0xff00 ff00 ff00 ff00 to check for non-latin1 characters.
  // sethi only sets bits 31..10, hence the sethi+add pair to build 0xff00ff00.
  Assembler::sethi(0xff00fc00, mask); // mask = 0x0000 0000 ff00 fc00
  add(mask, 0x300, mask);             // mask = 0x0000 0000 ff00 ff00
  sllx(mask, 32, tmp1);               // tmp1 = 0xff00 ff00 0000 0000
  or3(mask, tmp1, mask);              // mask = 0xff00 ff00 ff00 ff00

  // Load first 8 bytes (4 chars)
  ldx(src, 0, tmp1);

  bind(Lloop);
  // Load next 8 bytes
  ldx(src, 8, tmp2);

  // Check for non-latin1 character by testing if the most significant byte of a char is set.
  // Although we have to move the data between integer and floating point registers, this is
  // still faster than the corresponding VIS instructions (ford/fand/fcmpd).
  or3(tmp1, tmp2, tmp3);
  btst(tmp3, mask);
  // annul zeroing if branch is not taken to preserve original count
  brx(Assembler::notZero, true, Assembler::pn, Ldone);
  delayed()->mov(G0, result); // 0 - failed

  // Move bytes into float registers for the VIS shuffle
  movxtod(tmp1, ftmp1);
  movxtod(tmp2, ftmp2);

  // Compress by copying one byte per char from ftmp1 and ftmp2 to ftmp3
  bshuffle(ftmp1, ftmp2, ftmp3);
  stf(FloatRegisterImpl::D, ftmp3, dst, 0);

  // Increment addresses and decrement count
  inc(src, 16);
  inc(dst, 8);
  dec(cnt, 8);

  cmp(cnt, 8);
  // annul LDX if branch is not taken to prevent access past end of string
  br(Assembler::greaterEqual, true, Assembler::pt, Lloop);
  delayed()->ldx(src, 0, tmp1);

  // Fallback to slow version: fewer than 8 chars remain (or inputs unaligned)
  bind(Lslow);
}

// Compress char[] to byte[]. Return 0 on failure.
// Scalar loop: one char per iteration. 'cnt' must be non-zero on entry
// (the loop is bottom-tested after an unconditional first load).
// On a non-latin1 char (> 0xff), sets result = 0 and branches to Ldone;
// on success the loop simply falls through with 'result' untouched —
// presumably the caller pre-loaded result with the count; confirm at call site.
void C2_MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register result, Register tmp, Label& Ldone) {
  Label Lloop;
  assert_different_registers(src, dst, cnt, tmp, result);

  // First char is loaded before the loop; subsequent loads are hoisted
  // into the annulled delay slot of the back-branch.
  lduh(src, 0, tmp);

  bind(Lloop);
  inc(src, sizeof(jchar));
  cmp(tmp, 0xff);
  // annul zeroing if branch is not taken to preserve original count
  br(Assembler::greater, true, Assembler::pn, Ldone); // don't check xcc
  delayed()->mov(G0, result); // 0 - failed
  deccc(cnt);                 // sets icc for the loop-back branch below
  stb(tmp, dst, 0);
  inc(dst);
  // annul LDUH if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Lloop);
  delayed()->lduh(src, 0, tmp); // hoisted
}

// Inflate byte[] to char[] by inflating 16 bytes at once.
// Emits a fast VIS3 loop that inflates 8 latin1 bytes to 8 chars (16 bytes)
// per iteration by merging each source byte with a zero byte (fpmerge).
// Falls through to the local Lslow label when cnt < 8 or src/dst are not
// 8-byte aligned, with src/dst/cnt updated — the caller presumably emits a
// scalar tail loop (see string_inflate) after this; TODO(review): confirm at
// the call site. NOTE(review): the Ldone parameter is never referenced in
// this function — looks like it is kept for signature symmetry with
// string_compress_16; confirm.
void C2_MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, Register tmp,
                                          FloatRegister ftmp1, FloatRegister ftmp2, FloatRegister ftmp3, FloatRegister ftmp4, Label& Ldone) {
  Label Lloop, Lslow;
  assert(UseVIS >= 3, "VIS3 is required"); // fpmerge below is a VIS instruction
  assert_different_registers(src, dst, cnt, tmp);
  assert_different_registers(ftmp1, ftmp2, ftmp3, ftmp4);

  // Check if cnt >= 8 (= 16 bytes of output)
  cmp(cnt, 8);
  br(Assembler::less, false, Assembler::pn, Lslow);
  delayed()->nop();

  // Check for 8-byte alignment of src and dst
  or3(src, dst, tmp);
  andcc(tmp, 7, G0);
  br(Assembler::notZero, false, Assembler::pn, Lslow);
  // Initialize float register to zero (done in the delay slot; harmless
  // even when the branch to Lslow is taken since it is not annulled)
  FloatRegister zerof = ftmp4;
  delayed()->fzero(FloatRegisterImpl::D, zerof);

  // Load first 8 bytes
  ldf(FloatRegisterImpl::D, src, 0, ftmp1);

  bind(Lloop);
  inc(src, 8);
  dec(cnt, 8);

  // Inflate the string by interleaving each byte from the source array
  // with a zero byte and storing the result in the destination array.
  // ftmp1->successor() is the low single half of the double register, so the
  // second half of the source doubleword goes to dst+8.
  fpmerge(zerof, ftmp1->successor(), ftmp2);
  stf(FloatRegisterImpl::D, ftmp2, dst, 8);
  fpmerge(zerof, ftmp1, ftmp3);
  stf(FloatRegisterImpl::D, ftmp3, dst, 0);

  inc(dst, 16);

  cmp(cnt, 8);
  // annul LDX if branch is not taken to prevent access past end of string
  br(Assembler::greaterEqual, true, Assembler::pt, Lloop);
  delayed()->ldf(FloatRegisterImpl::D, src, 0, ftmp1);

  // Fallback to slow version: fewer than 8 chars remain (or inputs unaligned)
  bind(Lslow);
}

// Inflate byte[] to char[].
// Scalar inflate loop: one byte -> one char per iteration. 'cnt' must be
// non-zero on entry (bottom-tested loop after an unconditional first load).
// NOTE(review): the Ldone parameter is never referenced here — looks like it
// is kept for signature symmetry with the other string stubs; confirm.
void C2_MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp, Label& Ldone) {
  Label Loop;
  assert_different_registers(src, dst, cnt, tmp);

  // First byte loaded before the loop; subsequent loads are hoisted into
  // the annulled delay slot of the back-branch.
  ldub(src, 0, tmp);
  bind(Loop);
  inc(src);
  deccc(cnt);              // sets icc for the loop-back branch below
  sth(tmp, dst, 0);        // store zero-extended byte as a 16-bit char
  inc(dst, sizeof(jchar));
  // annul LDUB if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Loop);
  delayed()->ldub(src, 0, tmp); // hoisted
}

// Emits code for String.compareTo-style comparison.
// str1/str2: addresses of the first elements; cnt1/cnt2: lengths in elements
// (chars halved up front for the mixed-encoding cases). 'ae' encodes the
// argument encodings: StrIntrinsicNode::LL, UU, LU or UL.
// result: 0 if equal, otherwise the difference of the first mismatched
// chars, or the length difference if one string is a prefix of the other.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       Register tmp1, Register tmp2,
                                       Register result, int ae) {
  Label Ldone, Lloop;
  assert_different_registers(str1, str2, cnt1, cnt2, tmp1, result);
  int stride1, stride2; // bytes per element for str1 resp. str2

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // Like this, str1 is always latin1 encoded, except for the UU case.

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // cnt2 is a byte count for the UTF-16 string; halve it to chars.
    srl(cnt2, 1, cnt2);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  Label Lskip;
  Register diff = tmp1;
  subcc(cnt1, cnt2, diff);
  br(Assembler::greater, true, Assembler::pt, Lskip);
  // cnt2 is shorter, so use its count:
  delayed()->mov(cnt2, cnt1);
  bind(Lskip);

  // Rename registers
  Register limit1 = cnt1;
  Register limit2 = limit1;
  Register chr1   = result;
  Register chr2   = cnt2;
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // We need an additional register to keep track of two limits
    assert_different_registers(str1, str2, cnt1, cnt2, tmp1, tmp2, result);
    limit2 = tmp2;
  }

  // Is the minimum length zero?
  cmp(limit1, (int)0); // use cast to resolve overloading ambiguity
  br(Assembler::equal, true, Assembler::pn, Ldone);
  // result is difference in lengths
  if (ae == StrIntrinsicNode::UU) {
    delayed()->sra(diff, 1, result);  // Divide by 2 to get number of chars
  } else {
    delayed()->mov(diff, result);
  }

  // Load first characters (element width depends on encoding)
  if (ae == StrIntrinsicNode::LL) {
    stride1 = stride2 = sizeof(jbyte);
    ldub(str1, 0, chr1);
    ldub(str2, 0, chr2);
  } else if (ae == StrIntrinsicNode::UU) {
    stride1 = stride2 = sizeof(jchar);
    lduh(str1, 0, chr1);
    lduh(str2, 0, chr2);
  } else {
    // Mixed: str1 is latin1, str2 is UTF-16 (UL was swapped above).
    stride1 = sizeof(jbyte);
    stride2 = sizeof(jchar);
    ldub(str1, 0, chr1);
    lduh(str2, 0, chr2);
  }

  // Compare first characters
  subcc(chr1, chr2, chr1);
  br(Assembler::notZero, false, Assembler::pt, Ldone);
  assert(chr1 == result, "result must be pre-placed");
  delayed()->nop();

  // Check if the strings start at same location
  cmp(str1, str2);
  brx(Assembler::equal, true, Assembler::pn, Ldone);
  delayed()->mov(G0, result);  // result is zero

  // We have no guarantee that on 64 bit the higher half of limit is 0
  signx(limit1);

  // Get limit (byte limit per string; chr1/chr2 temporarily hold limit-stride)
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    sll(limit1, 1, limit2);
    subcc(limit2, stride2, chr2);
  }
  subcc(limit1, stride1, chr1);
  br(Assembler::zero, true, Assembler::pn, Ldone);
  // result is difference in lengths
  if (ae == StrIntrinsicNode::UU) {
    delayed()->sra(diff, 1, result);  // Divide by 2 to get number of chars
  } else {
    delayed()->mov(diff, result);
  }

  // Shift str1 and str2 to the end of the arrays, negate limit
  add(str1, limit1, str1);
  add(str2, limit2, str2);
  neg(chr1, limit1);  // limit1 = -(limit1-stride1)
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    neg(chr2, limit2);  // limit2 = -(limit2-stride2)
  }

  // Compare the rest of the characters. limit1/limit2 are negative offsets
  // counting up towards zero; the loads below index off the array ends.
  load_sized_value(Address(str1, limit1), chr1, (ae == StrIntrinsicNode::UU) ? 2 : 1, false);

  bind(Lloop);
  load_sized_value(Address(str2, limit2), chr2, (ae == StrIntrinsicNode::LL) ? 1 : 2, false);

  subcc(chr1, chr2, chr1);
  br(Assembler::notZero, false, Assembler::pt, Ldone);
  assert(chr1 == result, "result must be pre-placed");
  delayed()->inccc(limit1, stride1); // advance; sets cc for the loop test
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    inccc(limit2, stride2); // cc from the wider (UTF-16) limit drives the loop
  }

  // annul LDUB if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Lloop);
  delayed()->load_sized_value(Address(str1, limit1), chr1, (ae == StrIntrinsicNode::UU) ? 2 : 1, false);

  // If strings are equal up to min length, return the length difference.
  if (ae == StrIntrinsicNode::UU) {
    // Divide by 2 to get number of chars
    sra(diff, 1, result);
  } else {
    mov(diff, result);
  }

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    // Negate result (see note above: str1/str2 were interchanged)
    neg(result);
  }
}

// Emits code for Arrays.equals / String.equals.
// is_array_equ: when true, ary1/ary2 are array oops (null/length checks and
// base-offset adjustment are emitted); when false they are raw element
// addresses and 'limit' is already a byte count.
// is_byte: element type selector (byte[] vs char[]).
// result: 1 if equal, 0 otherwise. Reads past the logical end of the data up
// to the enclosing 8-byte doubleword; garbage is masked off in the tail.
void C2_MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                     Register limit, Register tmp, Register result, bool is_byte) {
  Label Ldone, Lloop, Lremaining;
  assert_different_registers(ary1, ary2, limit, tmp, result);

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
  assert(base_offset % 8 == 0, "Base offset must be 8-byte aligned");

  if (is_array_equ) {
    // return true if the same array
    cmp(ary1, ary2);
    brx(Assembler::equal, true, Assembler::pn, Ldone);
    delayed()->mov(1, result);  // equal

    br_null(ary1, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal

    br_null(ary2, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal

    // load the lengths of arrays
    ld(Address(ary1, length_offset), limit);
    ld(Address(ary2, length_offset), tmp);

    // return false if the two arrays are not equal length
    cmp(limit, tmp);
    br(Assembler::notEqual, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal
  }

  cmp_zero_and_br(Assembler::zero, limit, Ldone, true, Assembler::pn);
  delayed()->mov(1, result); // zero-length arrays are equal

  if (is_array_equ) {
    // load array addresses
    add(ary1, base_offset, ary1);
    add(ary2, base_offset, ary2);
    // set byte count
    if (!is_byte) {
      sll(limit, exact_log2(sizeof(jchar)), limit);
    }
  } else {
    // We have no guarantee that on 64 bit the higher half of limit is 0
    signx(limit);
  }

#ifdef ASSERT
  // Sanity check for doubleword (8-byte) alignment of ary1 and ary2.
  // Guaranteed on 64-bit systems (see arrayOopDesc::header_size_in_bytes()).
  Label Laligned;
  or3(ary1, ary2, tmp);
  andcc(tmp, 7, tmp);
  br_null_short(tmp, Assembler::pn, Laligned);
  STOP("First array element is not 8-byte aligned.");
  should_not_reach_here();
  bind(Laligned);
#endif

  // Shift ary1 and ary2 to the end of the arrays, negate limit
  add(ary1, limit, ary1);
  add(ary2, limit, ary2);
  neg(limit, limit);

  // MAIN LOOP
  // Load and compare array elements of size 'byte_width' until the elements are not
  // equal or we reached the end of the arrays. If the size of the arrays is not a
  // multiple of 'byte_width', we simply read over the end of the array, bail out and
  // compare the remaining bytes below by skipping the garbage bytes.
  ldx(ary1, limit, result);
  bind(Lloop);
  ldx(ary2, limit, tmp);
  inccc(limit, 8);
  // Bail out if we reached the end (but still do the comparison)
  br(Assembler::positive, false, Assembler::pn, Lremaining);
  delayed()->cmp(result, tmp);
  // Check equality of elements
  brx(Assembler::equal, false, Assembler::pt, target(Lloop));
  delayed()->ldx(ary1, limit, result); // hoisted next load

  ba(Ldone);
  delayed()->clr(result); // not equal

  // TAIL COMPARISON
  // We got here because we reached the end of the arrays. 'limit' is the number of
  // garbage bytes we may have compared by reading over the end of the arrays. Shift
  // out the garbage and compare the remaining elements.
  bind(Lremaining);
  // Optimistic shortcut: elements potentially including garbage are equal
  brx(Assembler::equal, true, Assembler::pt, target(Ldone));
  delayed()->mov(1, result); // equal
  // Shift 'limit' bytes to the right and compare
  sll(limit, 3, limit); // bytes to bits
  srlx(result, limit, result);
  srlx(tmp, limit, tmp);
  cmp(result, tmp);
  clr(result);
  movcc(Assembler::equal, false, xcc, 1, result); // result = 1 iff tails match

  bind(Ldone);
}

// Emits code testing whether any byte of inp[0..size) has its high bit set
// (i.e. is negative when interpreted as a signed byte).
void C2_MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {

  // test for negative bytes in input string of a given size
  // result 1 if found, 0 otherwise.

  Label Lcore, Ltail, Lreturn, Lcore_rpt;

  assert_different_registers(inp, size, t2, t3, t4, t5, result);

  Register i     = result;  // result used as integer index i until very end
  Register lmask = t2;      // t2 is aliased to lmask

  // INITIALIZATION
  // ===========================================================
  // initialize highbits mask -> lmask = 0x8080808080808080  (8B/64b)
  // compute unaligned offset -> i
  // compute core end index   -> t5
  Assembler::sethi(0x80808000, t2);   //! sethi macro fails to emit optimal
  add(t2, 0x80, t2);                  // sethi drops the low 10 bits; add them back
  sllx(t2, 32, t3);
  or3(t3, t2, lmask);                 // 0x8080808080808080 -> lmask
  sra(size,0,size);                   // sign-extend size to 64 bits
  andcc(inp, 0x7, i);                 // unaligned offset -> i
  br(Assembler::zero, true, Assembler::pn, Lcore); // starts 8B aligned?
  delayed()->add(size, -8, t5);       // (annuled) core end index -> t5

  // ===========================================================

  // UNALIGNED HEAD
  // ===========================================================
  // * unaligned head handling: grab aligned 8B containing unaligned inp(ut)
  // * obliterate (ignore) bytes outside string by shifting off reg ends
  // * compare with bitmask, short circuit return true if one or more high
  //   bits set.
  cmp(size, 0);
  br(Assembler::zero, true, Assembler::pn, Lreturn); // short-circuit?
  delayed()->mov(0,result);      // annuled so i not clobbered for following
  neg(i, t4);
  add(i, size, t5);
  ldx(inp, t4, t3);  // raw aligned 8B containing unaligned head -> t3
  mov(8, t4);
  sub(t4, t5, t4);
  sra(t4, 31, t5);   // branch-free min: t5 = sign mask of (8 - (i+size))
  andn(t4, t5, t5);
  add(i, t5, t4);
  sll(t5, 3, t5);
  sll(t4, 3, t4);    // # bits to shift right, left -> t5,t4
  srlx(t3, t5, t3);
  sllx(t3, t4, t3);  // bytes outside string in 8B header obliterated -> t3
  andcc(lmask, t3, G0);
  brx(Assembler::notZero, true, Assembler::pn, Lreturn); // short circuit?
  delayed()->mov(1,result);      // annuled so i not clobbered for following
  add(size, -8, t5);             // core end index -> t5
  mov(8, t4);
  sub(t4, i, i);                 // # bytes examined in unalgn head (<8) -> i
  // ===========================================================

  // ALIGNED CORE
  // ===========================================================
  // * iterate index i over aligned 8B sections of core, comparing with
  //   bitmask, short circuit return true if one or more high bits set
  // t5 contains core end index/loop limit which is the index
  //   of the MSB of last (unaligned) 8B fully contained in the string.
  // inp contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Lcore);
  cmp_and_br_short(i, t5, Assembler::greater, Assembler::pn, Ltail);
  bind(Lcore_rpt);
  ldx(inp, i, t3);
  andcc(t3, lmask, G0);
  brx(Assembler::notZero, true, Assembler::pn, Lreturn);
  delayed()->mov(1, result);     // annuled so i not clobbered for following
  add(i, 8, i);
  cmp_and_br_short(i, t5, Assembler::lessEqual, Assembler::pn, Lcore_rpt);
  // ===========================================================

  // ALIGNED TAIL (<8B)
  // ===========================================================
  // handle aligned tail of 7B or less as complete 8B, obliterating end of
  // string bytes by shifting them off end, compare what's left with bitmask
  // inp contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Ltail);
  subcc(size, i, t4);   // # of remaining bytes in string -> t4
  // return 0 if no more remaining bytes
  br(Assembler::lessEqual, true, Assembler::pn, Lreturn);
  delayed()->mov(0, result); // annuled so i not clobbered for following
  ldx(inp, i, t3);       // load final 8B (aligned) containing tail -> t3
  mov(8, t5);
  sub(t5, t4, t4);
  mov(0, result);        // ** i clobbered at this point
  sll(t4, 3, t4);        // bits beyond end of string -> t4
  srlx(t3, t4, t3);      // bytes beyond end now obliterated -> t3
  andcc(lmask, t3, G0);
  movcc(Assembler::notZero, false, xcc, 1, result);
  bind(Lreturn);
}