/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/arrayOop.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Compress char[] to byte[] by compressing 16 bytes at once. Return 0 on failure.
void C2_MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, Register result,
                                           Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                           FloatRegister ftmp1, FloatRegister ftmp2, FloatRegister ftmp3, Label& Ldone) {
  Label Lloop, Lslow;
  assert(UseVIS >= 3, "VIS3 is required");
  assert_different_registers(src, dst, cnt, tmp1, tmp2, tmp3, tmp4, result);
  assert_different_registers(ftmp1, ftmp2, ftmp3);

  // Check if cnt >= 8 (= 16 bytes)
  cmp(cnt, 8);
  br(Assembler::less, false, Assembler::pn, Lslow);
  delayed()->mov(cnt, result); // copy count

  // Check for 8-byte alignment of src and dst
  or3(src, dst, tmp1);
  andcc(tmp1, 7, G0);
  br(Assembler::notZero, false, Assembler::pn, Lslow);
  delayed()->nop();

  // Set mask for bshuffle instruction
  Register mask = tmp4;
  set(0x13579bdf, mask);
  bmask(mask, G0, G0);
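  // bmask stores the value in GSR.mask, which bshuffle uses: each nibble selects a
  // source byte (0-7 from the first operand, 8-15 from the second), so 0x13579bdf
  // picks the odd-numbered bytes, i.e. the low byte of each big-endian char.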

  // Set mask to 0xff00 ff00 ff00 ff00 to check for non-latin1 characters
  Assembler::sethi(0xff00fc00, mask); // mask = 0x0000 0000 ff00 fc00
  add(mask, 0x300, mask);             // mask = 0x0000 0000 ff00 ff00
  sllx(mask, 32, tmp1);               // tmp1 = 0xff00 ff00 0000 0000
  or3(mask, tmp1, mask);              // mask = 0xff00 ff00 ff00 ff00

  // Load first 8 bytes
  ldx(src, 0, tmp1);

  bind(Lloop);
  // Load next 8 bytes
  ldx(src, 8, tmp2);

  // Check for non-latin1 character by testing if the most significant byte of a char is set.
  // Although we have to move the data between integer and floating point registers, this is
  // still faster than the corresponding VIS instructions (ford/fand/fcmpd).
  or3(tmp1, tmp2, tmp3);
  btst(tmp3, mask);
  // annul zeroing if branch is not taken to preserve original count
  brx(Assembler::notZero, true, Assembler::pn, Ldone);
  delayed()->mov(G0, result); // 0 - failed

  // Move bytes into float register
  movxtod(tmp1, ftmp1);
  movxtod(tmp2, ftmp2);

  // Compress by copying one byte per char from ftmp1 and ftmp2 to ftmp3
  bshuffle(ftmp1, ftmp2, ftmp3);
  stf(FloatRegisterImpl::D, ftmp3, dst, 0);

  // Increment addresses and decrement count
  inc(src, 16);
  inc(dst, 8);
  dec(cnt, 8);

  cmp(cnt, 8);
  // annul LDX if branch is not taken to prevent access past end of string
  br(Assembler::greaterEqual, true, Assembler::pt, Lloop);
  delayed()->ldx(src, 0, tmp1);

  // Fallback to slow version
  bind(Lslow);
}

// Compress char[] to byte[]. Return 0 on failure.
void C2_MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register result, Register tmp, Label& Ldone) {
  Label Lloop;
  assert_different_registers(src, dst, cnt, tmp, result);

  lduh(src, 0, tmp);

  bind(Lloop);
  inc(src, sizeof(jchar));
  cmp(tmp, 0xff);
  // annul zeroing if branch is not taken to preserve original count
  br(Assembler::greater, true, Assembler::pn, Ldone); // don't check xcc
  delayed()->mov(G0, result); // 0 - failed
  deccc(cnt);
  stb(tmp, dst, 0);
  inc(dst);
  // annul LDUH if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Lloop);
  delayed()->lduh(src, 0, tmp); // hoisted
}

// Inflate byte[] to char[] by inflating 16 bytes at once.
void C2_MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, Register tmp,
                                          FloatRegister ftmp1, FloatRegister ftmp2, FloatRegister ftmp3, FloatRegister ftmp4, Label& Ldone) {
  Label Lloop, Lslow;
  assert(UseVIS >= 3, "VIS3 is required");
  assert_different_registers(src, dst, cnt, tmp);
  assert_different_registers(ftmp1, ftmp2, ftmp3, ftmp4);

  // Check if cnt >= 8 (= 16 bytes)
  cmp(cnt, 8);
  br(Assembler::less, false, Assembler::pn, Lslow);
  delayed()->nop();

  // Check for 8-byte alignment of src and dst
  or3(src, dst, tmp);
  andcc(tmp, 7, G0);
  br(Assembler::notZero, false, Assembler::pn, Lslow);
  // Initialize float register to zero
  FloatRegister zerof = ftmp4;
  delayed()->fzero(FloatRegisterImpl::D, zerof);

  // Load first 8 bytes
  ldf(FloatRegisterImpl::D, src, 0, ftmp1);

  bind(Lloop);
  inc(src, 8);
  dec(cnt, 8);

  // Inflate the string by interleaving each byte from the source array
  // with a zero byte and storing the result in the destination array.
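  // ftmp1, viewed as a single-precision register, holds source bytes 0..3 and
  // ftmp1->successor() holds bytes 4..7; fpmerge pairs each of them with a zero
  // byte from zerof, producing the chars stored at dst+0 and dst+8 respectively.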
  fpmerge(zerof, ftmp1->successor(), ftmp2);
  stf(FloatRegisterImpl::D, ftmp2, dst, 8);
  fpmerge(zerof, ftmp1, ftmp3);
  stf(FloatRegisterImpl::D, ftmp3, dst, 0);

  inc(dst, 16);

  cmp(cnt, 8);
  // annul LDF if branch is not taken to prevent access past end of string
  br(Assembler::greaterEqual, true, Assembler::pt, Lloop);
  delayed()->ldf(FloatRegisterImpl::D, src, 0, ftmp1);

  // Fallback to slow version
  bind(Lslow);
}

// Inflate byte[] to char[].
void C2_MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp, Label& Ldone) {
  Label Loop;
  assert_different_registers(src, dst, cnt, tmp);

  ldub(src, 0, tmp);
  bind(Loop);
  inc(src);
  deccc(cnt);
  sth(tmp, dst, 0);
  inc(dst, sizeof(jchar));
  // annul LDUB if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Loop);
  delayed()->ldub(src, 0, tmp); // hoisted
}

void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       Register tmp1, Register tmp2,
                                       Register result, int ae) {
  Label Ldone, Lloop;
  assert_different_registers(str1, str2, cnt1, cnt2, tmp1, result);
  int stride1, stride2;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // This way, str1 is always latin1 encoded, except for the UU case.

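  // 'ae' gives the argument encodings: LL = both strings latin1, UU = both UTF-16,
  // LU and UL = mixed. In the mixed cases cnt2 comes in as a byte count for the
  // UTF-16 string, so it is halved below to obtain a character count comparable
  // to cnt1 (one byte per latin1 character).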
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    srl(cnt2, 1, cnt2);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  Label Lskip;
  Register diff = tmp1;
  subcc(cnt1, cnt2, diff);
  br(Assembler::greater, true, Assembler::pt, Lskip);
  // cnt2 is shorter, so use its count:
  delayed()->mov(cnt2, cnt1);
  bind(Lskip);

  // Rename registers
  Register limit1 = cnt1;
  Register limit2 = limit1;
  Register chr1   = result;
  Register chr2   = cnt2;
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // We need an additional register to keep track of two limits
    assert_different_registers(str1, str2, cnt1, cnt2, tmp1, tmp2, result);
    limit2 = tmp2;
  }

  // Is the minimum length zero?
  cmp(limit1, (int)0); // use cast to resolve overloading ambiguity
  br(Assembler::equal, true, Assembler::pn, Ldone);
  // result is difference in lengths
  if (ae == StrIntrinsicNode::UU) {
    delayed()->sra(diff, 1, result);  // Divide by 2 to get number of chars
  } else {
    delayed()->mov(diff, result);
  }

  // Load first characters
  if (ae == StrIntrinsicNode::LL) {
    stride1 = stride2 = sizeof(jbyte);
    ldub(str1, 0, chr1);
    ldub(str2, 0, chr2);
  } else if (ae == StrIntrinsicNode::UU) {
    stride1 = stride2 = sizeof(jchar);
    lduh(str1, 0, chr1);
    lduh(str2, 0, chr2);
  } else {
    stride1 = sizeof(jbyte);
    stride2 = sizeof(jchar);
    ldub(str1, 0, chr1);
    lduh(str2, 0, chr2);
  }

  // Compare first characters
  subcc(chr1, chr2, chr1);
  br(Assembler::notZero, false, Assembler::pt, Ldone);
  assert(chr1 == result, "result must be pre-placed");
  delayed()->nop();

  // Check if the strings start at same location
  cmp(str1, str2);
  brx(Assembler::equal, true, Assembler::pn, Ldone);
  delayed()->mov(G0, result);  // result is zero

  // We have no guarantee that the upper half of limit is zero on 64-bit
  signx(limit1);

  // Get limit
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    sll(limit1, 1, limit2);
    subcc(limit2, stride2, chr2);
  }
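  // The first characters were already compared above, so there is one stride
  // less to go; if that leaves nothing, the strings match over the minimum
  // length and the length difference (set in the delay slot) is the result.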
  subcc(limit1, stride1, chr1);
  br(Assembler::zero, true, Assembler::pn, Ldone);
  // result is difference in lengths
  if (ae == StrIntrinsicNode::UU) {
    delayed()->sra(diff, 1, result);  // Divide by 2 to get number of chars
  } else {
    delayed()->mov(diff, result);
  }

  // Shift str1 and str2 to the end of the arrays, negate limit
  add(str1, limit1, str1);
  add(str2, limit2, str2);
  neg(chr1, limit1);  // limit1 = -(limit1-stride1)
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    neg(chr2, limit2);  // limit2 = -(limit2-stride2)
  }
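  // str1/str2 now point just past the last character to be compared, and
  // limit1/limit2 hold negative offsets to the next character, so a single
  // inccc per string both advances the offset and sets the condition codes
  // that terminate the loop once the offset reaches zero.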

  // Compare the rest of the characters
  load_sized_value(Address(str1, limit1), chr1, (ae == StrIntrinsicNode::UU) ? 2 : 1, false);

  bind(Lloop);
  load_sized_value(Address(str2, limit2), chr2, (ae == StrIntrinsicNode::LL) ? 1 : 2, false);

  subcc(chr1, chr2, chr1);
  br(Assembler::notZero, false, Assembler::pt, Ldone);
  assert(chr1 == result, "result must be pre-placed");
  delayed()->inccc(limit1, stride1);
  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    inccc(limit2, stride2);
  }

  // annul the load if branch is not taken to prevent access past end of string
  br(Assembler::notZero, true, Assembler::pt, Lloop);
  delayed()->load_sized_value(Address(str1, limit1), chr1, (ae == StrIntrinsicNode::UU) ? 2 : 1, false);

  // If strings are equal up to min length, return the length difference.
  if (ae == StrIntrinsicNode::UU) {
    // Divide by 2 to get number of chars
    sra(diff, 1, result);
  } else {
    mov(diff, result);
  }

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    // Negate result (see note above)
    neg(result);
  }
}

void C2_MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                     Register limit, Register tmp, Register result, bool is_byte) {
  Label Ldone, Lloop, Lremaining;
  assert_different_registers(ary1, ary2, limit, tmp, result);

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
  assert(base_offset % 8 == 0, "Base offset must be 8-byte aligned");

  if (is_array_equ) {
    // return true if the same array
    cmp(ary1, ary2);
    brx(Assembler::equal, true, Assembler::pn, Ldone);
    delayed()->mov(1, result);  // equal

    br_null(ary1, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal

    br_null(ary2, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal

    // load the lengths of arrays
    ld(Address(ary1, length_offset), limit);
    ld(Address(ary2, length_offset), tmp);

    // return false if the two arrays are not equal length
    cmp(limit, tmp);
    br(Assembler::notEqual, true, Assembler::pn, Ldone);
    delayed()->clr(result);     // not equal
  }

  cmp_zero_and_br(Assembler::zero, limit, Ldone, true, Assembler::pn);
  delayed()->mov(1, result); // zero-length arrays are equal

  if (is_array_equ) {
    // load array addresses
    add(ary1, base_offset, ary1);
    add(ary2, base_offset, ary2);
    // set byte count
    if (!is_byte) {
      sll(limit, exact_log2(sizeof(jchar)), limit);
    }
  } else {
    // We have no guarantee that the upper half of limit is zero on 64-bit
    signx(limit);
  }

#ifdef ASSERT
  // Sanity check for doubleword (8-byte) alignment of ary1 and ary2.
  // Guaranteed on 64-bit systems (see arrayOopDesc::header_size_in_bytes()).
  Label Laligned;
  or3(ary1, ary2, tmp);
  andcc(tmp, 7, tmp);
  br_null_short(tmp, Assembler::pn, Laligned);
  STOP("First array element is not 8-byte aligned.");
  should_not_reach_here();
  bind(Laligned);
#endif

  // Shift ary1 and ary2 to the end of the arrays, negate limit
  add(ary1, limit, ary1);
  add(ary2, limit, ary2);
  neg(limit, limit);

  // MAIN LOOP
  // Load and compare the arrays in 8-byte (doubleword) chunks until the elements
  // are not equal or we reach the end of the arrays. If the byte count is not a
  // multiple of 8, we simply read over the end of the array, bail out and compare
  // the remaining bytes below by skipping the garbage bytes.
  ldx(ary1, limit, result);
  bind(Lloop);
  ldx(ary2, limit, tmp);
  inccc(limit, 8);
  // Bail out if we reached the end (but still do the comparison)
  br(Assembler::positive, false, Assembler::pn, Lremaining);
  delayed()->cmp(result, tmp);
  // Check equality of elements
  brx(Assembler::equal, false, Assembler::pt, target(Lloop));
  delayed()->ldx(ary1, limit, result);

  ba(Ldone);
  delayed()->clr(result); // not equal

  // TAIL COMPARISON
  // We got here because we reached the end of the arrays. 'limit' is the number of
  // garbage bytes we may have compared by reading over the end of the arrays. Shift
  // out the garbage and compare the remaining elements.
  bind(Lremaining);
  // Optimistic shortcut: elements potentially including garbage are equal
  brx(Assembler::equal, true, Assembler::pt, target(Ldone));
  delayed()->mov(1, result); // equal
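  // On big-endian SPARC the garbage bytes occupy the least significant byte
  // positions of the loaded doublewords, so shifting right by 8 * 'limit' bits
  // discards exactly the bytes read past the end.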
  // Shift 'limit' bytes to the right and compare
  sll(limit, 3, limit); // bytes to bits
  srlx(result, limit, result);
  srlx(tmp, limit, tmp);
  cmp(result, tmp);
  clr(result);
  movcc(Assembler::equal, false, xcc, 1, result);

  bind(Ldone);
}

void C2_MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {

  // Test for negative bytes in the input string of a given size.
  // Set result to 1 if any are found, 0 otherwise.

  Label Lcore, Ltail, Lreturn, Lcore_rpt;

  assert_different_registers(inp, size, t2, t3, t4, t5, result);

  Register i     = result;  // result used as integer index i until very end
  Register lmask = t2;      // t2 is aliased to lmask

  // INITIALIZATION
  // ===========================================================
  // initialize highbits mask -> lmask = 0x8080808080808080  (8B/64b)
  // compute unaligned offset -> i
  // compute core end index   -> t5
  Assembler::sethi(0x80808000, t2);   // the sethi macro does not emit optimal code here
  add(t2, 0x80, t2);
  sllx(t2, 32, t3);
  or3(t3, t2, lmask);                 // 0x8080808080808080 -> lmask
  sra(size, 0, size);                 // sign-extend size to 64 bits
  andcc(inp, 0x7, i);                 // unaligned offset -> i
  br(Assembler::zero, true, Assembler::pn, Lcore); // starts 8B aligned?
  delayed()->add(size, -8, t5);       // (annulled) core end index -> t5

  // ===========================================================

  // UNALIGNED HEAD
  // ===========================================================
  // * unaligned head handling: grab aligned 8B containing unaligned inp(ut)
  // * obliterate (ignore) bytes outside string by shifting off reg ends
  // * compare with bitmask, short circuit return true if one or more high
  //   bits set.
  cmp(size, 0);
  br(Assembler::zero, true, Assembler::pn, Lreturn); // short-circuit?
  delayed()->mov(0, result);     // annulled so i not clobbered for following
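  // Compute how many of the 8 loaded bytes lie before the string start (i) and,
  // if the string ends within this doubleword, how many lie past its end
  // (max(0, 8 - i - size)); both are converted to bit counts (t5 = right shift
  // to drop the trailing bytes, t4 = left shift to drop the leading bytes).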
  neg(i, t4);
  add(i, size, t5);
  ldx(inp, t4, t3);  // raw aligned 8B containing unaligned head -> t3
  mov(8, t4);
  sub(t4, t5, t4);
  sra(t4, 31, t5);
  andn(t4, t5, t5);
  add(i, t5, t4);
  sll(t5, 3, t5);
  sll(t4, 3, t4);   // # bits to shift right, left -> t5,t4
  srlx(t3, t5, t3);
  sllx(t3, t4, t3); // bytes outside string in 8B header obliterated -> t3
  andcc(lmask, t3, G0);
  brx(Assembler::notZero, true, Assembler::pn, Lreturn); // short circuit?
  delayed()->mov(1, result);     // annulled so i not clobbered for following
  add(size, -8, t5);             // core end index -> t5
  mov(8, t4);
  sub(t4, i, i);                 // # bytes examined in unalgn head (<8) -> i
  // ===========================================================

  // ALIGNED CORE
  // ===========================================================
  // * iterate index i over aligned 8B sections of core, comparing with
  //   bitmask, short circuit return true if one or more high bits set
  // t5 contains core end index/loop limit which is the index
  //     of the MSB of last (unaligned) 8B fully contained in the string.
  // inp   contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i     contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Lcore);
  cmp_and_br_short(i, t5, Assembler::greater, Assembler::pn, Ltail);
  bind(Lcore_rpt);
  ldx(inp, i, t3);
  andcc(t3, lmask, G0);
  brx(Assembler::notZero, true, Assembler::pn, Lreturn);
  delayed()->mov(1, result);    // annulled so i not clobbered for following
  add(i, 8, i);
  cmp_and_br_short(i, t5, Assembler::lessEqual, Assembler::pn, Lcore_rpt);
  // ===========================================================

  // ALIGNED TAIL (<8B)
  // ===========================================================
  // handle aligned tail of 7B or less as complete 8B, obliterating end of
  // string bytes by shifting them off end, compare what's left with bitmask
  // inp   contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i     contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Ltail);
  subcc(size, i, t4);   // # of remaining bytes in string -> t4
  // return 0 if no more remaining bytes
  br(Assembler::lessEqual, true, Assembler::pn, Lreturn);
  delayed()->mov(0, result); // annulled so i not clobbered for following
  ldx(inp, i, t3);       // load final 8B (aligned) containing tail -> t3
  mov(8, t5);
  sub(t5, t4, t4);
  mov(0, result);        // ** i clobbered at this point
  sll(t4, 3, t4);        // bits beyond end of string          -> t4
  srlx(t3, t4, t3);      // bytes beyond end now obliterated   -> t3
  andcc(lmask, t3, G0);
  movcc(Assembler::notZero, false, xcc, 1, result);
  bind(Lreturn);
}
 526