1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  52   guarantee(PostLoopMultiversioning, "must be");
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(k1, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask() {
  61   guarantee(PostLoopMultiversioning, "must be");
  62   Assembler::knotwl(k1, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
  71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  72 
  73   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  74   if (PrintPreciseRTMLockingStatistics) {
  75     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  76       Label check_abort;
  77       testl(abort_status, (1<<i));
  78       jccb(Assembler::equal, check_abort);
  79       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  80       bind(check_abort);
  81     }
  82   }
  83 }
  84 
  85 // Branch if ((random & (count-1)) != 0), where count is a power of two (2^n)
  86 // tmp, scr and flags are killed
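// Illustrative scalar sketch of the test emitted below (the low 32 bits of the
// TSC serve as the pseudo-random value):
//   if ((rdtsc_lo & (count - 1)) != 0) goto brLabel;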
  87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  88   assert(tmp == rax, "");
  89   assert(scr == rdx, "");
  90   rdtsc(); // modifies EDX:EAX
  91   andptr(tmp, count-1);
  92   jccb(Assembler::notZero, brLabel);
  93 }
  94 
  95 // Perform abort ratio calculation, set no_rtm bit if high ratio
  96 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  97 // tmpReg, rtm_counters_Reg and flags are killed
  98 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  99                                                     Register rtm_counters_Reg,
 100                                                     RTMLockingCounters* rtm_counters,
 101                                                     Metadata* method_data) {
 102   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 103 
 104   if (RTMLockingCalculationDelay > 0) {
 105     // Delay calculation
 106     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 107     testptr(tmpReg, tmpReg);
 108     jccb(Assembler::equal, L_done);
 109   }
 110   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 111   //   Aborted transactions = abort_count * 100
 112   //   All transactions = total_count *  RTMTotalCountIncrRate
 113   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
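  // In plain C the ratio test amounts to (illustrative sketch only, not emitted code):
  //   if (abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
  //     mdo->rtm_state |= NoRTM;   // 'mdo' stands for the MethodData being updated
  //   }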
 114 
 115   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 116   cmpptr(tmpReg, RTMAbortThreshold);
 117   jccb(Assembler::below, L_check_always_rtm2);
 118   imulptr(tmpReg, tmpReg, 100);
 119 
 120   Register scrReg = rtm_counters_Reg;
 121   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 122   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 123   imulptr(scrReg, scrReg, RTMAbortRatio);
 124   cmpptr(tmpReg, scrReg);
 125   jccb(Assembler::below, L_check_always_rtm1);
 126   if (method_data != NULL) {
 127     // set rtm_state to "no rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 131   }
 132   jmpb(L_done);
 133   bind(L_check_always_rtm1);
 134   // Reload RTMLockingCounters* address
 135   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 136   bind(L_check_always_rtm2);
 137   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 138   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 139   jccb(Assembler::below, L_done);
 140   if (method_data != NULL) {
 141     // set rtm_state to "always rtm" in MDO
 142     mov_metadata(tmpReg, method_data);
 143     lock();
 144     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 145   }
 146   bind(L_done);
 147 }
 148 
 149 // Update counters and perform abort ratio calculation
 150 // input:  abort_status_Reg
 151 // rtm_counters_Reg, flags are killed
 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 153                                       Register rtm_counters_Reg,
 154                                       RTMLockingCounters* rtm_counters,
 155                                       Metadata* method_data,
 156                                       bool profile_rtm) {
 157 
 158   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 159   // update rtm counters based on rax value at abort
 160   // reads abort_status_Reg, updates flags
 161   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 162   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 163   if (profile_rtm) {
 164     // Save abort status because abort_status_Reg is used by following code.
 165     if (RTMRetryCount > 0) {
 166       push(abort_status_Reg);
 167     }
 168     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 169     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 170     // restore abort status
 171     if (RTMRetryCount > 0) {
 172       pop(abort_status_Reg);
 173     }
 174   }
 175 }
 176 
 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 178 // inputs: retry_count_Reg
 179 //       : abort_status_Reg
 180 // output: retry_count_Reg decremented by 1
 181 // flags are killed
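// Scalar sketch of the logic below (illustrative only):
//   if ((abort_status & 0x6) != 0 && retry_count != 0) {
//     retry_count--;
//     goto retryLabel;
//   }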
 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 183   Label doneRetry;
 184   assert(abort_status_Reg == rax, "");
 185   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 186   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 187   // if reason is in 0x6 and retry count != 0 then retry
 188   andptr(abort_status_Reg, 0x6);
 189   jccb(Assembler::zero, doneRetry);
 190   testl(retry_count_Reg, retry_count_Reg);
 191   jccb(Assembler::zero, doneRetry);
 192   pause();
 193   decrementl(retry_count_Reg);
 194   jmp(retryLabel);
 195   bind(doneRetry);
 196 }
 197 
 198 // Spin and retry if lock is busy,
 199 // inputs: box_Reg (monitor address)
 200 //       : retry_count_Reg
 201 // output: retry_count_Reg decremented by 1
 202 //       : clear z flag if retry count exceeded
 203 // tmp_Reg, scr_Reg, flags are killed
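// Scalar sketch of the logic below (illustrative only):
//   if (retry_count == 0) { /* clear ZF: retries exhausted */ return; }
//   retry_count--;
//   for (int i = RTMSpinLoopCount; i > 0; i--) {
//     pause();
//     if (monitor->_owner == NULL) break;
//   }
//   goto retryLabel;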
 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 205                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 206   Label SpinLoop, SpinExit, doneRetry;
 207   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 208 
 209   testl(retry_count_Reg, retry_count_Reg);
 210   jccb(Assembler::zero, doneRetry);
 211   decrementl(retry_count_Reg);
 212   movptr(scr_Reg, RTMSpinLoopCount);
 213 
 214   bind(SpinLoop);
 215   pause();
 216   decrementl(scr_Reg);
 217   jccb(Assembler::lessEqual, SpinExit);
 218   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 219   testptr(tmp_Reg, tmp_Reg);
 220   jccb(Assembler::notZero, SpinLoop);
 221 
 222   bind(SpinExit);
 223   jmp(retryLabel);
 224   bind(doneRetry);
 225   incrementl(retry_count_Reg); // clear z flag
 226 }
 227 
 228 // Use RTM for normal stack locks
 229 // Input: objReg (object to lock)
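// Outline of the fast path below (a sketch of the transactional flow):
//   xbegin();                                  // enter transactional region
//   if ((obj->mark & lock_mask) == unlocked)   // lock word looks free:
//     goto DONE_LABEL;                         //   lock is elided, done
//   xend() or xabort();                        // busy -> abort, retry or fall back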
 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 231                                          Register retry_on_abort_count_Reg,
 232                                          RTMLockingCounters* stack_rtm_counters,
 233                                          Metadata* method_data, bool profile_rtm,
 234                                          Label& DONE_LABEL, Label& IsInflated) {
 235   assert(UseRTMForStackLocks, "why call this otherwise?");
 236   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 237   assert(tmpReg == rax, "");
 238   assert(scrReg == rdx, "");
 239   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 240 
 241   if (RTMRetryCount > 0) {
 242     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 243     bind(L_rtm_retry);
 244   }
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 246   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 247   jcc(Assembler::notZero, IsInflated);
 248 
 249   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 250     Label L_noincrement;
 251     if (RTMTotalCountIncrRate > 1) {
 252       // tmpReg, scrReg and flags are killed
 253       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 254     }
 255     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 256     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 257     bind(L_noincrement);
 258   }
 259   xbegin(L_on_abort);
 260   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 261   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 262   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 263   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 264 
 265   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 266   if (UseRTMXendForLockBusy) {
 267     xend();
 268     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 269     jmp(L_decrement_retry);
 270   }
 271   else {
 272     xabort(0);
 273   }
 274   bind(L_on_abort);
 275   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 276     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 277   }
 278   bind(L_decrement_retry);
 279   if (RTMRetryCount > 0) {
 280     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 281     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 282   }
 283 }
 284 
 285 // Use RTM for inflated locks
 286 // inputs: objReg (object to lock)
 287 //         boxReg (on-stack box address (displaced header location) - KILLED)
 288 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
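// Outline of the fast path below (a sketch of the transactional flow):
//   xbegin();
//   if (monitor->_owner == NULL) goto DONE_LABEL;  // lock elided transactionally
//   xend() or xabort();                            // owned -> retry, spin, or
//                                                  // CAS _owner directly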
 289 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 290                                             Register scrReg, Register retry_on_busy_count_Reg,
 291                                             Register retry_on_abort_count_Reg,
 292                                             RTMLockingCounters* rtm_counters,
 293                                             Metadata* method_data, bool profile_rtm,
 294                                             Label& DONE_LABEL) {
 295   assert(UseRTMLocking, "why call this otherwise?");
 296   assert(tmpReg == rax, "");
 297   assert(scrReg == rdx, "");
 298   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 299   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 300 
 301   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 302   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 303   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 304 
 305   if (RTMRetryCount > 0) {
 306     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 307     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 308     bind(L_rtm_retry);
 309   }
 310   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 311     Label L_noincrement;
 312     if (RTMTotalCountIncrRate > 1) {
 313       // tmpReg, scrReg and flags are killed
 314       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 315     }
 316     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 317     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 318     bind(L_noincrement);
 319   }
 320   xbegin(L_on_abort);
 321   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 322   movptr(tmpReg, Address(tmpReg, owner_offset));
 323   testptr(tmpReg, tmpReg);
 324   jcc(Assembler::zero, DONE_LABEL);
 325   if (UseRTMXendForLockBusy) {
 326     xend();
 327     jmp(L_decrement_retry);
 328   }
 329   else {
 330     xabort(0);
 331   }
 332   bind(L_on_abort);
 333   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 334   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 335     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 336   }
 337   if (RTMRetryCount > 0) {
 338     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 339     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 340   }
 341 
 342   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 343   testptr(tmpReg, tmpReg) ;
 344   jccb(Assembler::notZero, L_decrement_retry) ;
 345 
 346   // Appears unlocked - try to swing _owner from null to non-null.
 347   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 348 #ifdef _LP64
 349   Register threadReg = r15_thread;
 350 #else
 351   get_thread(scrReg);
 352   Register threadReg = scrReg;
 353 #endif
 354   lock();
 355   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 356 
 357   if (RTMRetryCount > 0) {
 358     // success done else retry
 359     jccb(Assembler::equal, DONE_LABEL) ;
 360     bind(L_decrement_retry);
 361     // Spin and retry if lock is busy.
 362     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 363   }
 364   else {
 365     bind(L_decrement_retry);
 366   }
 367 }
 368 
 369 #endif //  INCLUDE_RTM_OPT
 370 
 371 // fast_lock and fast_unlock used by C2
 372 
 373 // Because the transitions from emitted code to the runtime
 374 // monitorenter/exit helper stubs are so slow it's critical that
 375 // we inline both the stack-locking fast path and the inflated fast path.
 376 //
 377 // See also: cmpFastLock and cmpFastUnlock.
 378 //
 379 // What follows is a specialized inline transliteration of the code
 380 // in enter() and exit(). If we're concerned about I$ bloat another
 381 // option would be to emit TrySlowEnter and TrySlowExit methods
 382 // at startup-time.  These methods would accept arguments as
 383 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 384 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 385 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 386 // In practice, however, the # of lock sites is bounded and is usually small.
 387 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 388 // if the processor uses simple bimodal branch predictors keyed by EIP,
 389 // since the helper routines would be called from multiple synchronization
 390 // sites.
 391 //
 392 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 393 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 394 // to those specialized methods.  That'd give us a mostly platform-independent
 395 // implementation that the JITs could optimize and inline at their pleasure.
 396 // Done correctly, the only time we'd need to cross into native code would be
 397 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 398 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 399 // (b) provide explicit barriers or fence operations.
 400 //
 401 // TODO:
 402 //
 403 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 404 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 405 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 406 //    the lock operators would typically be faster than reifying Self.
 407 //
 408 // *  Ideally I'd define the primitives as:
 409 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 410 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 411 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 412 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 413 //    Furthermore the register assignments are overconstrained, possibly resulting in
 414 //    sub-optimal code near the synchronization site.
 415 //
 416 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 417 //    Alternately, use a better sp-proximity test.
 418 //
 419 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 420 //    Either one is sufficient to uniquely identify a thread.
 421 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 422 //
 423 // *  Intrinsify notify() and notifyAll() for the common cases where the
 424 //    object is locked by the calling thread but the waitlist is empty,
 425 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 426 //
 427 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 428 //    But beware of excessive branch density on AMD Opterons.
 429 //
 430 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 431 //    or failure of the fast path.  If the fast path fails then we pass
 432 //    control to the slow path, typically in C.  In fast_lock and
 433 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 434 //    will emit a conditional branch immediately after the node.
 435 //    So we have branches to branches and lots of ICC.ZF games.
 436 //    Instead, it might be better to have C2 pass a "FailureLabel"
 437 //    into fast_lock and fast_unlock.  In the case of success, control
 438 //    will drop through the node.  ICC.ZF is undefined at exit.
 439 //    In the case of failure, the node will branch directly to the
 440 //    FailureLabel
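//
// Conceptually C2 consumes the ZF result as (sketch):
//    fast_lock(obj, box, ...)    // sets ICC.ZF
//    jne   slow_path_call        // ZF == 0 -> call the runtime monitorenter stub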
 441 
 442 
 443 // obj: object to lock
 444 // box: on-stack box address (displaced header location) - KILLED
 445 // rax: tmp -- KILLED
 446 // scr: tmp -- KILLED
 447 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 448                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 449                                  BiasedLockingCounters* counters,
 450                                  RTMLockingCounters* rtm_counters,
 451                                  RTMLockingCounters* stack_rtm_counters,
 452                                  Metadata* method_data,
 453                                  bool use_rtm, bool profile_rtm) {
 454   // Ensure the register assignments are disjoint
 455   assert(tmpReg == rax, "");
 456 
 457   if (use_rtm) {
 458     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 459   } else {
 460     assert(cx2Reg == noreg, "");
 461     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 462   }
 463 
 464   if (counters != NULL) {
 465     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 466   }
 467 
 468   // Possible cases that we'll encounter in fast_lock
 469   // ------------------------------------------------
 470   // * Inflated
 471   //    -- unlocked
 472   //    -- Locked
 473   //       = by self
 474   //       = by other
 475   // * biased
 476   //    -- by Self
 477   //    -- by other
 478   // * neutral
 479   // * stack-locked
 480   //    -- by self
 481   //       = sp-proximity test hits
 482   //       = sp-proximity test generates false-negative
 483   //    -- by other
 484   //
 485 
 486   Label IsInflated, DONE_LABEL;
 487 
 488   // it's stack-locked, biased or neutral
 489   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 490   // order to reduce the number of conditional branches in the most common cases.
 491   // Beware -- there's a subtle invariant that fetch of the markword
 492   // at [FETCH], below, will never observe a biased encoding (*101b).
 493   // If this invariant is not held we risk exclusion (safety) failure.
 494   if (UseBiasedLocking && !UseOptoBiasInlining) {
 495     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 496   }
 497 
 498 #if INCLUDE_RTM_OPT
 499   if (UseRTMForStackLocks && use_rtm) {
 500     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 501                       stack_rtm_counters, method_data, profile_rtm,
 502                       DONE_LABEL, IsInflated);
 503   }
 504 #endif // INCLUDE_RTM_OPT
 505 
 506   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 507   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 508   jccb(Assembler::notZero, IsInflated);
 509 
 510   // Attempt stack-locking ...
 511   orptr (tmpReg, markWord::unlocked_value);
 512   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 513   lock();
 514   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 515   if (counters != NULL) {
 516     cond_inc32(Assembler::equal,
 517                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 518   }
 519   jcc(Assembler::equal, DONE_LABEL);           // Success
 520 
 521   // Recursive locking.
 522   // The object is stack-locked: markword contains stack pointer to BasicLock.
 523   // Locked by current thread if difference with current SP is less than one page.
 524   subptr(tmpReg, rsp);
 525   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 526   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 527   movptr(Address(boxReg, 0), tmpReg);
 528   if (counters != NULL) {
 529     cond_inc32(Assembler::equal,
 530                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 531   }
 532   jmp(DONE_LABEL);
 533 
 534   bind(IsInflated);
 535   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 536 
 537 #if INCLUDE_RTM_OPT
 538   // Use the same RTM locking code in 32- and 64-bit VM.
 539   if (use_rtm) {
 540     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 541                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 542   } else {
 543 #endif // INCLUDE_RTM_OPT
 544 
 545 #ifndef _LP64
 546   // The object is inflated.
 547 
 548   // boxReg refers to the on-stack BasicLock in the current frame.
 549   // We'd like to write:
 550   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 551   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 552   // additional latency as we have another ST in the store buffer that must drain.
 553 
 554   // avoid ST-before-CAS
 555   // register juggle because we need tmpReg for cmpxchgptr below
 556   movptr(scrReg, boxReg);
 557   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 558 
 559   // Optimistic form: consider XORL tmpReg,tmpReg
 560   movptr(tmpReg, NULL_WORD);
 561 
 562   // Appears unlocked - try to swing _owner from null to non-null.
 563   // Ideally, I'd manifest "Self" with get_thread and then attempt
 564   // to CAS the register containing Self into m->Owner.
 565   // But we don't have enough registers, so instead we can either try to CAS
 566   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 567   // we later store "Self" into m->Owner.  Transiently storing a stack address
 568   // (rsp or the address of the box) into  m->owner is harmless.
 569   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
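  // In outline (sketch): if (CAS(&m->_owner, NULL, box) == NULL) { m->_owner = Self; ZF = 1; }
  //                      otherwise ZF = 0 and control passes to the slow path.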
 570   lock();
 571   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 572   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 573   // If we weren't able to swing _owner from NULL to the BasicLock
 574   // then take the slow path.
 575   jccb  (Assembler::notZero, DONE_LABEL);
 576   // update _owner from BasicLock to thread
 577   get_thread (scrReg);                    // beware: clobbers ICCs
 578   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 579   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 580 
 581   // If the CAS fails we can either retry or pass control to the slow path.
 582   // We use the latter tactic.
 583   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 584   // If the CAS was successful ...
 585   //   Self has acquired the lock
 586   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 587   // Intentional fall-through into DONE_LABEL ...
 588 #else // _LP64
 589   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 590   movq(scrReg, tmpReg);
 591   xorq(tmpReg, tmpReg);
 592   lock();
 593   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 594   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 595   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 596   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 597   // Intentional fall-through into DONE_LABEL ...
 598   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 599 #endif // _LP64
 600 #if INCLUDE_RTM_OPT
 601   } // use_rtm()
 602 #endif
 603   // DONE_LABEL is a hot target - we'd really like to place it at the
 604   // start of a cache line by padding with NOPs.
 605   // See the AMD and Intel software optimization manuals for the
 606   // most efficient "long" NOP encodings.
 607   // Unfortunately none of our alignment mechanisms suffice.
 608   bind(DONE_LABEL);
 609 
 610   // At DONE_LABEL the icc ZFlag is set as follows ...
 611   // fast_unlock uses the same protocol.
 612   // ZFlag == 1 -> Success
 613   // ZFlag == 0 -> Failure - force control through the slow path
 614 }
 615 
 616 // obj: object to unlock
 617 // box: box address (displaced header location), killed.  Must be EAX.
 618 // tmp: killed, cannot be obj nor box.
 619 //
 620 // Some commentary on balanced locking:
 621 //
 622 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 623 // Methods that don't have provably balanced locking are forced to run in the
 624 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 625 // The interpreter provides two properties:
 626 // I1:  At return-time the interpreter automatically and quietly unlocks any
 627 //      objects acquired by the current activation (frame).  Recall that the
 628 //      interpreter maintains an on-stack list of locks currently held by
 629 //      a frame.
 630 // I2:  If a method attempts to unlock an object that is not held by
 631 //      the frame, the interpreter throws IMSX.
 632 //
 633 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 634 // B() doesn't have provably balanced locking so it runs in the interpreter.
 635 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 636 // is still locked by A().
 637 //
 638 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 639 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 640 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 641 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 642 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 643 // could reasonably *avoid* checking the owner in fast_unlock().
 644 // In the interest of performance we elide the m->Owner==Self check in unlock.
 645 // A perfectly viable alternative is to elide the owner check except when
 646 // Xcheck:jni is enabled.
 647 
 648 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 649   assert(boxReg == rax, "");
 650   assert_different_registers(objReg, boxReg, tmpReg);
 651 
 652   Label DONE_LABEL, Stacked, CheckSucc;
 653 
 654   // Critically, the biased locking test must have precedence over
 655   // and appear before the (box->dhw == 0) recursive stack-lock test.
 656   if (UseBiasedLocking && !UseOptoBiasInlining) {
 657     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 658   }
 659 
 660 #if INCLUDE_RTM_OPT
 661   if (UseRTMForStackLocks && use_rtm) {
 662     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 663     Label L_regular_unlock;
 664     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 665     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 666     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 667     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 668     xend();                                                           // otherwise end...
 669     jmp(DONE_LABEL);                                                  // ... and we're done
 670     bind(L_regular_unlock);
 671   }
 672 #endif
 673 
 674   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 675   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 676   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 677   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 678   jccb  (Assembler::zero, Stacked);
 679 
 680   // It's inflated.
 681 #if INCLUDE_RTM_OPT
 682   if (use_rtm) {
 683     Label L_regular_inflated_unlock;
 684     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 685     movptr(boxReg, Address(tmpReg, owner_offset));
 686     testptr(boxReg, boxReg);
 687     jccb(Assembler::notZero, L_regular_inflated_unlock);
 688     xend();
 689     jmpb(DONE_LABEL);
 690     bind(L_regular_inflated_unlock);
 691   }
 692 #endif
 693 
 694   // Despite our balanced locking property we still check that m->_owner == Self
 695   // as java routines or native JNI code called by this thread might
 696   // have released the lock.
 697   // Refer to the comments in synchronizer.cpp for how we might encode extra
 698   // state in _succ so we can avoid fetching EntryList|cxq.
 699   //
 700   // I'd like to add more cases in fast_lock() and fast_unlock() --
 701   // such as recursive enter and exit -- but we have to be wary of
 702   // I$ bloat, T$ effects and BP$ effects.
 703   //
 704   // If there's no contention try a 1-0 exit.  That is, exit without
 705   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 706   // we detect and recover from the race that the 1-0 exit admits.
 707   //
 708   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 709   // before it STs null into _owner, releasing the lock.  Updates
 710   // to data protected by the critical section must be visible before
 711   // we drop the lock (and thus before any other thread could acquire
 712   // the lock and observe the fields protected by the lock).
 713   // IA32's memory-model is SPO, so STs are ordered with respect to
 714   // each other and there's no need for an explicit barrier (fence).
 715   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
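  // Sketch of the 1-0 exit protocol as implemented in the 64-bit path below:
  //   if (m->_recursions != 0) goto slow;
  //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = NULL; goto done; }
  //   if (m->_succ == NULL) goto slow;
  //   m->_owner = NULL;  fence();
  //   if (m->_succ != NULL) goto done;                      // successor will make progress
  //   if (CAS(&m->_owner, NULL, Self) != NULL) goto done;   // someone else took the lock
  //   goto slow;                                            // we re-acquired; defer to runtime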
 716 #ifndef _LP64
 717   get_thread (boxReg);
 718 
 719   // Note that we could employ various encoding schemes to reduce
 720   // the number of loads below (currently 4) to just 2 or 3.
 721   // Refer to the comments in synchronizer.cpp.
 722   // In practice the chain of fetches doesn't seem to impact performance, however.
 723   xorptr(boxReg, boxReg);
 724   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 725   jccb  (Assembler::notZero, DONE_LABEL);
 726   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 727   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 728   jccb  (Assembler::notZero, CheckSucc);
 729   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 730   jmpb  (DONE_LABEL);
 731 
 732   bind (Stacked);
 733   // It's not inflated and it's not recursively stack-locked and it's not biased.
 734   // It must be stack-locked.
 735   // Try to reset the header to displaced header.
 736   // The "box" value on the stack is stable, so we can reload
 737   // and be assured we observe the same value as above.
 738   movptr(tmpReg, Address(boxReg, 0));
 739   lock();
 740   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 741   // Intentional fall-through into DONE_LABEL
 742 
 743   // DONE_LABEL is a hot target - we'd really like to place it at the
 744   // start of a cache line by padding with NOPs.
 745   // See the AMD and Intel software optimization manuals for the
 746   // most efficient "long" NOP encodings.
 747   // Unfortunately none of our alignment mechanisms suffice.
 748   bind (CheckSucc);
 749 #else // _LP64
 750   // It's inflated
 751   xorptr(boxReg, boxReg);
 752   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 753   jccb  (Assembler::notZero, DONE_LABEL);
 754   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 755   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 756   jccb  (Assembler::notZero, CheckSucc);
 757   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 758   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 759   jmpb  (DONE_LABEL);
 760 
 761   // Try to avoid passing control into the slow_path ...
 762   Label LSuccess, LGoSlowPath ;
 763   bind  (CheckSucc);
 764 
 765   // The following optional optimization can be elided if necessary
 766   // Effectively: if (succ == null) goto slow path
 767   // The code reduces the window for a race, however,
 768   // and thus benefits performance.
 769   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 770   jccb  (Assembler::zero, LGoSlowPath);
 771 
 772   xorptr(boxReg, boxReg);
 773   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 774   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 775 
 776   // Memory barrier/fence
 777   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 778   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 779   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 780   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 781   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 782   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 783   lock(); addl(Address(rsp, 0), 0);
 784 
 785   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 786   jccb  (Assembler::notZero, LSuccess);
 787 
 788   // Rare inopportune interleaving - race.
 789   // The successor vanished in the small window above.
 790   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 791   // We need to ensure progress and succession.
 792   // Try to reacquire the lock.
 793   // If that fails then the new owner is responsible for succession and this
 794   // thread needs to take no further action and can exit via the fast path (success).
 795   // If the re-acquire succeeds then pass control into the slow path.
 796   // As implemented, this latter mode is horrible because we generate more
 797   // coherence traffic on the lock *and* artificially extend the critical section
 798   // length by virtue of passing control into the slow path.
 799 
 800   // box is really RAX -- the following CMPXCHG depends on that binding
 801   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 802   lock();
 803   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 804   // There's no successor so we tried to regrab the lock.
 805   // If that didn't work, then another thread grabbed the
 806   // lock so we're done (and exit was a success).
 807   jccb  (Assembler::notEqual, LSuccess);
 808   // Intentional fall-through into slow path
 809 
 810   bind  (LGoSlowPath);
 811   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 812   jmpb  (DONE_LABEL);
 813 
 814   bind  (LSuccess);
 815   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 816   jmpb  (DONE_LABEL);
 817 
 818   bind  (Stacked);
 819   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 820   lock();
 821   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 822 
 823 #endif
 824   bind(DONE_LABEL);
 825 }
 826 
 827 //-------------------------------------------------------------------------------------------
 828 // Generic instruction support for C2 code generation in .ad files
 829 
 830 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 831   if (dst != src) {
 832     movdqu(dst, src);
 833   }
 834   if (opcode == Op_AbsVD) {
 835     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 836   } else {
 837     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 838     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 839   }
 840 }
 841 
 842 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 843   if (opcode == Op_AbsVD) {
 844     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 845   } else {
 846     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 847     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 848   }
 849 }
 850 
 851 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 852   if (dst != src) {
 853     movdqu(dst, src);
 854   }
 855   if (opcode == Op_AbsVF) {
 856     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 857   } else {
 858     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 859     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 860   }
 861 }
 862 
 863 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 864   if (opcode == Op_AbsVF) {
 865     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 866   } else {
 867     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 868     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 869   }
 870 }
 871 
 872 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 873   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
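  // Note: there is no packed signed 64-bit min/max instruction before AVX-512, so the
  // T_LONG paths below synthesize it (sketch): mask = pcmpgtq(dst, src) for Min
  // (or pcmpgtq(src, dst) for Max), then blendvpd selects src where the mask is set,
  // giving per lane
  //   dst[i] = (dst[i] > src[i]) ? src[i] : dst[i]   // Op_MinV
  // blendvpd uses xmm0 as its implicit mask, hence the tmp == xmm0 requirement.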
 874 
 875   if (opcode == Op_MinV) {
 876     if (elem_bt == T_BYTE) {
 877       pminsb(dst, src);
 878     } else if (elem_bt == T_SHORT) {
 879       pminsw(dst, src);
 880     } else if (elem_bt == T_INT) {
 881       pminsd(dst, src);
 882     } else {
 883       assert(elem_bt == T_LONG, "required");
 884       assert(tmp == xmm0, "required");
 885       movdqu(xmm0, dst);
 886       pcmpgtq(xmm0, src);
 887       blendvpd(dst, src);  // xmm0 as mask
 888     }
 889   } else { // opcode == Op_MaxV
 890     if (elem_bt == T_BYTE) {
 891       pmaxsb(dst, src);
 892     } else if (elem_bt == T_SHORT) {
 893       pmaxsw(dst, src);
 894     } else if (elem_bt == T_INT) {
 895       pmaxsd(dst, src);
 896     } else {
 897       assert(elem_bt == T_LONG, "required");
 898       assert(tmp == xmm0, "required");
 899       movdqu(xmm0, src);
 900       pcmpgtq(xmm0, dst);
 901       blendvpd(dst, src);  // xmm0 as mask
 902     }
 903   }
 904 }
 905 
 906 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 907                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 908                                  int vlen_enc) {
 909   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 910 
 911   if (opcode == Op_MinV) {
 912     if (elem_bt == T_BYTE) {
 913       vpminsb(dst, src1, src2, vlen_enc);
 914     } else if (elem_bt == T_SHORT) {
 915       vpminsw(dst, src1, src2, vlen_enc);
 916     } else if (elem_bt == T_INT) {
 917       vpminsd(dst, src1, src2, vlen_enc);
 918     } else {
 919       assert(elem_bt == T_LONG, "required");
 920       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 921         vpminsq(dst, src1, src2, vlen_enc);
 922       } else {
 923         vpcmpgtq(dst, src1, src2, vlen_enc);
 924         vblendvpd(dst, src1, src2, dst, vlen_enc);
 925       }
 926     }
 927   } else { // opcode == Op_MaxV
 928     if (elem_bt == T_BYTE) {
 929       vpmaxsb(dst, src1, src2, vlen_enc);
 930     } else if (elem_bt == T_SHORT) {
 931       vpmaxsw(dst, src1, src2, vlen_enc);
 932     } else if (elem_bt == T_INT) {
 933       vpmaxsd(dst, src1, src2, vlen_enc);
 934     } else {
 935       assert(elem_bt == T_LONG, "required");
 936       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 937         vpmaxsq(dst, src1, src2, vlen_enc);
 938       } else {
 939         vpcmpgtq(dst, src1, src2, vlen_enc);
 940         vblendvpd(dst, src2, src1, dst, vlen_enc);
 941       }
 942     }
 943   }
 944 }
 945 
 946 // Float/Double min max
 947 
 948 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 949                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 950                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 951                                    int vlen_enc) {
 952   assert(UseAVX > 0, "required");
 953   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 954          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 955   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 956 
 957   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 958   bool is_double_word = is_double_word_type(elem_bt);
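  // The blend/min-max/blend sequences below are arranged so the lane-wise result
  // matches Java Math.min/max semantics (a sketch of the intent):
  //   - a NaN in either input yields NaN
  //   - min(-0.0, +0.0) == -0.0 and max(-0.0, +0.0) == +0.0
  // which raw vminps/vmaxps (which simply return the second operand in those
  // cases) do not provide on their own.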
 959 
 960   if (!is_double_word && is_min) {
 961     vblendvps(atmp, a, b, a, vlen_enc);
 962     vblendvps(btmp, b, a, a, vlen_enc);
 963     vminps(tmp, atmp, btmp, vlen_enc);
 964     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 965     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 966   } else if (!is_double_word && !is_min) {
 967     vblendvps(btmp, b, a, b, vlen_enc);
 968     vblendvps(atmp, a, b, b, vlen_enc);
 969     vmaxps(tmp, atmp, btmp, vlen_enc);
 970     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 971     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 972   } else if (is_double_word && is_min) {
 973     vblendvpd(atmp, a, b, a, vlen_enc);
 974     vblendvpd(btmp, b, a, a, vlen_enc);
 975     vminpd(tmp, atmp, btmp, vlen_enc);
 976     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 977     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 978   } else {
 979     assert(is_double_word && !is_min, "sanity");
 980     vblendvpd(btmp, b, a, b, vlen_enc);
 981     vblendvpd(atmp, a, b, b, vlen_enc);
 982     vmaxpd(tmp, atmp, btmp, vlen_enc);
 983     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 984     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 985   }
 986 }
 987 
 988 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 989                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 990                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 991                                     int vlen_enc) {
 992   assert(UseAVX > 2, "required");
 993   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 994          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 995   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 996 
 997   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 998   bool is_double_word = is_double_word_type(elem_bt);
 999   bool merge = true;
1000 
1001   if (!is_double_word && is_min) {
1002     evpmovd2m(ktmp, a, vlen_enc);
1003     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1004     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1005     vminps(dst, atmp, btmp, vlen_enc);
1006     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1008   } else if (!is_double_word && !is_min) {
1009     evpmovd2m(ktmp, b, vlen_enc);
1010     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1011     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1012     vmaxps(dst, atmp, btmp, vlen_enc);
1013     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1014     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1015   } else if (is_double_word && is_min) {
1016     evpmovq2m(ktmp, a, vlen_enc);
1017     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1018     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1019     vminpd(dst, atmp, btmp, vlen_enc);
1020     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1022   } else {
1023     assert(is_double_word && !is_min, "sanity");
1024     evpmovq2m(ktmp, b, vlen_enc);
1025     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1027     vmaxpd(dst, atmp, btmp, vlen_enc);
1028     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1030   }
1031 }
1032 
1033 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1034   if (sign) {
1035     pmovsxbw(dst, src);
1036   } else {
1037     pmovzxbw(dst, src);
1038   }
1039 }
1040 
1041 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1042   if (sign) {
1043     vpmovsxbw(dst, src, vector_len);
1044   } else {
1045     vpmovzxbw(dst, src, vector_len);
1046   }
1047 }
1048 
1049 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1050   if (sign) {
1051     vpmovsxbd(dst, src, vector_len);
1052   } else {
1053     vpmovzxbd(dst, src, vector_len);
1054   }
1055 }
1056 
1057 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1058   if (sign) {
1059     vpmovsxwd(dst, src, vector_len);
1060   } else {
1061     vpmovzxwd(dst, src, vector_len);
1062   }
1063 }
1064 
1065 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1066   switch (opcode) {
1067     case Op_RShiftVI:  psrad(dst, shift); break;
1068     case Op_LShiftVI:  pslld(dst, shift); break;
1069     case Op_URShiftVI: psrld(dst, shift); break;
1070 
1071     default: assert(false, "%s", NodeClassNames[opcode]);
1072   }
1073 }
1074 
1075 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1076   switch (opcode) {
1077     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1078     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1079     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1080 
1081     default: assert(false, "%s", NodeClassNames[opcode]);
1082   }
1083 }
1084 
1085 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1086   switch (opcode) {
1087     case Op_RShiftVB:  // fall-through
1088     case Op_RShiftVS:  psraw(dst, shift); break;
1089 
1090     case Op_LShiftVB:  // fall-through
1091     case Op_LShiftVS:  psllw(dst, shift);   break;
1092 
1093     case Op_URShiftVS: // fall-through
1094     case Op_URShiftVB: psrlw(dst, shift);  break;
1095 
1096     default: assert(false, "%s", NodeClassNames[opcode]);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1101   switch (opcode) {
1102     case Op_RShiftVB:  // fall-through
1103     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1104 
1105     case Op_LShiftVB:  // fall-through
1106     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1107 
1108     case Op_URShiftVS: // fall-through
1109     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1110 
1111     default: assert(false, "%s", NodeClassNames[opcode]);
1112   }
1113 }
1114 
1115 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1116   switch (opcode) {
1117     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1118     case Op_LShiftVL:  psllq(dst, shift); break;
1119     case Op_URShiftVL: psrlq(dst, shift); break;
1120 
1121     default: assert(false, "%s", NodeClassNames[opcode]);
1122   }
1123 }
1124 
1125 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1126   switch (opcode) {
1127     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1128     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1129     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1130 
1131     default: assert(false, "%s", NodeClassNames[opcode]);
1132   }
1133 }
1134 
1135 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1136   switch (opcode) {
1137     case Op_VRShiftV:  vpsravd(dst, src, shift, vlen_enc); break;
1138     case Op_VLShiftV:  vpsllvd(dst, src, shift, vlen_enc); break;
1139     case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break;
1140 
1141     default: assert(false, "%s", NodeClassNames[opcode]);
1142   }
1143 }
1144 
1145 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1146   switch (opcode) {
1147     case Op_VRShiftV:  evpsravw(dst, src, shift, vlen_enc); break;
1148     case Op_VLShiftV:  evpsllvw(dst, src, shift, vlen_enc); break;
1149     case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break;
1150 
1151     default: assert(false, "%s", NodeClassNames[opcode]);
1152   }
1153 }
1154 
1155 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1156   assert(UseAVX >= 2, "required");
1157   switch (opcode) {
1158     case Op_VRShiftV: {
1159       if (UseAVX > 2) {
1160         assert(tmp == xnoreg, "not used");
1161         if (!VM_Version::supports_avx512vl()) {
1162           vlen_enc = Assembler::AVX_512bit;
1163         }
1164         evpsravq(dst, src, shift, vlen_enc);
1165       } else {
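        // AVX2 has no variable arithmetic right shift for 64-bit lanes, so the
        // sequence below relies on the identity (sketch):
        //   sra(x, s) == ((x >>> s) ^ t) - t,  where t == (0x8000000000000000 >>> s)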
1166         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1167         vpsrlvq(dst, src, shift, vlen_enc);
1168         vpsrlvq(tmp, tmp, shift, vlen_enc);
1169         vpxor(dst, dst, tmp, vlen_enc);
1170         vpsubq(dst, dst, tmp, vlen_enc);
1171       }
1172       break;
1173     }
1174     case Op_VLShiftV: {
1175       assert(tmp == xnoreg, "not used");
1176       vpsllvq(dst, src, shift, vlen_enc);
1177       break;
1178     }
1179     case Op_VURShiftV: {
1180       assert(tmp == xnoreg, "not used");
1181       vpsrlvq(dst, src, shift, vlen_enc);
1182       break;
1183     }
1184     default: assert(false, "%s", NodeClassNames[opcode]);
1185   }
1186 }
1187 
1188 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
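// In outline (sketch): widen the bytes to dwords, apply the variable dword shift,
// mask the results back into byte range, then pack the dwords down to words.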
1189 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1190   bool sign = (opcode != Op_VURShiftV);
1191   assert(vector_len == 0, "required");
1192   vextendbd(sign, dst, src, 1);
1193   vpmovzxbd(vtmp, shift, 1);
1194   varshiftd(opcode, dst, dst, vtmp, 1);
1195   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1196   vextracti128_high(vtmp, dst);
1197   vpackusdw(dst, dst, vtmp, 0);
1198 }
1199 
1200 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1201 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1202   bool sign = (opcode != Op_VURShiftV);
1203   int ext_vector_len = vector_len + 1;
1204   vextendbw(sign, dst, src, ext_vector_len);
1205   vpmovzxbw(vtmp, shift, ext_vector_len);
1206   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1207   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1208   if (vector_len == 0) {
1209     vextracti128_high(vtmp, dst);
1210     vpackuswb(dst, dst, vtmp, vector_len);
1211   } else {
1212     vextracti64x4_high(vtmp, dst);
1213     vpackuswb(dst, dst, vtmp, vector_len);
1214     vpermq(dst, dst, 0xD8, vector_len);
1215   }
1216 }
1217 
1218 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1219   switch(typ) {
1220     case T_BYTE:
1221       pinsrb(dst, val, idx);
1222       break;
1223     case T_SHORT:
1224       pinsrw(dst, val, idx);
1225       break;
1226     case T_INT:
1227       pinsrd(dst, val, idx);
1228       break;
1229     case T_LONG:
1230       pinsrq(dst, val, idx);
1231       break;
1232     default:
1233       assert(false,"Should not reach here.");
1234       break;
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1239   switch(typ) {
1240     case T_BYTE:
1241       vpinsrb(dst, src, val, idx);
1242       break;
1243     case T_SHORT:
1244       vpinsrw(dst, src, val, idx);
1245       break;
1246     case T_INT:
1247       vpinsrd(dst, src, val, idx);
1248       break;
1249     case T_LONG:
1250       vpinsrq(dst, src, val, idx);
1251       break;
1252     default:
1253       assert(false,"Should not reach here.");
1254       break;
1255   }
1256 }
1257 
1258 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1259   switch(typ) {
1260     case T_INT:
1261       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1262       break;
1263     case T_FLOAT:
1264       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1265       break;
1266     case T_LONG:
1267       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1268       break;
1269     case T_DOUBLE:
1270       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1271       break;
1272     default:
1273       assert(false,"Should not reach here.");
1274       break;
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1279   switch(typ) {
1280     case T_INT:
1281       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1282       break;
1283     case T_FLOAT:
1284       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1285       break;
1286     case T_LONG:
1287       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1288       break;
1289     case T_DOUBLE:
1290       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1291       break;
1292     default:
1293       assert(false,"Should not reach here.");
1294       break;
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1299   switch(typ) {
1300     case T_INT:
1301       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1302       break;
1303     case T_FLOAT:
1304       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1305       break;
1306     case T_LONG:
1307       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1308       break;
1309     case T_DOUBLE:
1310       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1311       break;
1312     default:
1313       assert(false,"Should not reach here.");
1314       break;
1315   }
1316 }
1317 
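// Turn a canonical boolean vector (one byte per element, expected to be 0 or
// 1) into an element-wide mask: 0 - 1 == 0xFF makes each set byte all ones,
// and the sign-extending pmovsx* then widens that to the element size.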
1318 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
1319   if (vlen_in_bytes <= 16) {
1320     pxor (dst, dst);
1321     psubb(dst, src);
1322     switch (elem_bt) {
1323       case T_BYTE:   /* nothing to do */ break;
1324       case T_SHORT:  pmovsxbw(dst, dst); break;
1325       case T_INT:    pmovsxbd(dst, dst); break;
1326       case T_FLOAT:  pmovsxbd(dst, dst); break;
1327       case T_LONG:   pmovsxbq(dst, dst); break;
1328       case T_DOUBLE: pmovsxbq(dst, dst); break;
1329 
1330       default: assert(false, "%s", type2name(elem_bt));
1331     }
1332   } else {
1333     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1334 
1335     vpxor (dst, dst, dst, vlen_enc);
1336     vpsubb(dst, dst, src, vlen_enc);
1337     switch (elem_bt) {
1338       case T_BYTE:   /* nothing to do */            break;
1339       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1340       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1341       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1342       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1343       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1344 
1345       default: assert(false, "%s", type2name(elem_bt));
1346     }
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1351   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1352   if (vlen_in_bytes <= 16) {
1353     movdqu(dst, addr, scratch);
1354   } else if (vlen_in_bytes == 32) {
1355     vmovdqu(dst, addr, scratch);
1356   } else {
1357     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1358     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1359   }
1360 }
1361 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1362 
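// The integer reduce* helpers below all follow the same pattern: fold the
// upper half of the vector into the lower half with the reduction operation
// (log2(vlen) steps), then combine the surviving lane with the incoming
// accumulator (the src1 GPR) and move the result to dst.  The FP variants
// instead apply the operation lane by lane in element order, since FP add and
// multiply are not associative and the evaluation order matters.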
1363 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1364   int vector_len = Assembler::AVX_128bit;
1365 
1366   switch (opcode) {
1367     case Op_AndReductionV:  pand(dst, src); break;
1368     case Op_OrReductionV:   por (dst, src); break;
1369     case Op_XorReductionV:  pxor(dst, src); break;
1370     case Op_MinReductionV:
1371       switch (typ) {
1372         case T_BYTE:        pminsb(dst, src); break;
1373         case T_SHORT:       pminsw(dst, src); break;
1374         case T_INT:         pminsd(dst, src); break;
1375         case T_LONG:        assert(UseAVX > 2, "required");
1376                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1377         default:            assert(false, "wrong type");
1378       }
1379       break;
1380     case Op_MaxReductionV:
1381       switch (typ) {
1382         case T_BYTE:        pmaxsb(dst, src); break;
1383         case T_SHORT:       pmaxsw(dst, src); break;
1384         case T_INT:         pmaxsd(dst, src); break;
1385         case T_LONG:        assert(UseAVX > 2, "required");
1386                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1387         default:            assert(false, "wrong type");
1388       }
1389       break;
1390     case Op_AddReductionVF: addss(dst, src); break;
1391     case Op_AddReductionVD: addsd(dst, src); break;
1392     case Op_AddReductionVI:
1393       switch (typ) {
1394         case T_BYTE:        paddb(dst, src); break;
1395         case T_SHORT:       paddw(dst, src); break;
1396         case T_INT:         paddd(dst, src); break;
1397         default:            assert(false, "wrong type");
1398       }
1399       break;
1400     case Op_AddReductionVL: paddq(dst, src); break;
1401     case Op_MulReductionVF: mulss(dst, src); break;
1402     case Op_MulReductionVD: mulsd(dst, src); break;
1403     case Op_MulReductionVI:
1404       switch (typ) {
1405         case T_SHORT:       pmullw(dst, src); break;
1406         case T_INT:         pmulld(dst, src); break;
1407         default:            assert(false, "wrong type");
1408       }
1409       break;
1410     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1411                             vpmullq(dst, dst, src, vector_len); break;
1412     default:                assert(false, "wrong opcode");
1413   }
1414 }
1415 
1416 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1417   int vector_len = Assembler::AVX_256bit;
1418 
1419   switch (opcode) {
1420     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1421     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1422     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1423     case Op_MinReductionV:
1424       switch (typ) {
1425         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1426         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1427         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1428         case T_LONG:        assert(UseAVX > 2, "required");
1429                             vpminsq(dst, src1, src2, vector_len); break;
1430         default:            assert(false, "wrong type");
1431       }
1432       break;
1433     case Op_MaxReductionV:
1434       switch (typ) {
1435         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1436         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1437         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1438         case T_LONG:        assert(UseAVX > 2, "required");
1439                             vpmaxsq(dst, src1, src2, vector_len); break;
1440         default:            assert(false, "wrong type");
1441       }
1442       break;
1443     case Op_AddReductionVI:
1444       switch (typ) {
1445         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1446         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1447         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1448         default:            assert(false, "wrong type");
1449       }
1450       break;
1451     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1452     case Op_MulReductionVI:
1453       switch (typ) {
1454         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1455         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1456         default:            assert(false, "wrong type");
1457       }
1458       break;
1459     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1460     default:                assert(false, "wrong opcode");
1461   }
1462 }
1463 
1464 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1465                                   XMMRegister dst, XMMRegister src,
1466                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1467   switch (opcode) {
1468     case Op_AddReductionVF:
1469     case Op_MulReductionVF:
1470       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1471       break;
1472 
1473     case Op_AddReductionVD:
1474     case Op_MulReductionVD:
1475       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1476       break;
1477 
1478     default: assert(false, "wrong opcode");
1479   }
1480 }
1481 
1482 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1483                              Register dst, Register src1, XMMRegister src2,
1484                              XMMRegister vtmp1, XMMRegister vtmp2) {
1485   switch (vlen) {
1486     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1487     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1488     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1489     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1490 
1491     default: assert(false, "wrong vector length");
1492   }
1493 }
1494 
1495 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1496                              Register dst, Register src1, XMMRegister src2,
1497                              XMMRegister vtmp1, XMMRegister vtmp2) {
1498   switch (vlen) {
1499     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1500     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1501     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1502     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1503 
1504     default: assert(false, "wrong vector length");
1505   }
1506 }
1507 
1508 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1509                              Register dst, Register src1, XMMRegister src2,
1510                              XMMRegister vtmp1, XMMRegister vtmp2) {
1511   switch (vlen) {
1512     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1513     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1514     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1515     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1516 
1517     default: assert(false, "wrong vector length");
1518   }
1519 }
1520 
1521 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1522                              Register dst, Register src1, XMMRegister src2,
1523                              XMMRegister vtmp1, XMMRegister vtmp2) {
1524   switch (vlen) {
1525     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1526     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1527     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1528     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1529 
1530     default: assert(false, "wrong vector length");
1531   }
1532 }
1533 
1534 #ifdef _LP64
1535 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1536                              Register dst, Register src1, XMMRegister src2,
1537                              XMMRegister vtmp1, XMMRegister vtmp2) {
1538   switch (vlen) {
1539     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1540     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1541     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1542 
1543     default: assert(false, "wrong vector length");
1544   }
1545 }
1546 #endif // _LP64
1547 
1548 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1549   switch (vlen) {
1550     case 2:
1551       assert(vtmp2 == xnoreg, "");
1552       reduce2F(opcode, dst, src, vtmp1);
1553       break;
1554     case 4:
1555       assert(vtmp2 == xnoreg, "");
1556       reduce4F(opcode, dst, src, vtmp1);
1557       break;
1558     case 8:
1559       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1560       break;
1561     case 16:
1562       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1563       break;
1564     default: assert(false, "wrong vector length");
1565   }
1566 }
1567 
1568 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1569   switch (vlen) {
1570     case 2:
1571       assert(vtmp2 == xnoreg, "");
1572       reduce2D(opcode, dst, src, vtmp1);
1573       break;
1574     case 4:
1575       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1576       break;
1577     case 8:
1578       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1579       break;
1580     default: assert(false, "wrong vector length");
1581   }
1582 }
1583 
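// For add reductions the horizontal-add forms (phaddd/phaddw/vphaddd/vphaddw)
// fold adjacent pairs in a single instruction; for the other opcodes the
// upper elements are shuffled down and combined via reduce_operation_128/256.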
1584 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1585   if (opcode == Op_AddReductionVI) {
1586     if (vtmp1 != src2) {
1587       movdqu(vtmp1, src2);
1588     }
1589     phaddd(vtmp1, vtmp1);
1590   } else {
1591     pshufd(vtmp1, src2, 0x1);
1592     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1593   }
1594   movdl(vtmp2, src1);
1595   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1596   movdl(dst, vtmp1);
1597 }
1598 
1599 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1600   if (opcode == Op_AddReductionVI) {
1601     if (vtmp1 != src2) {
1602       movdqu(vtmp1, src2);
1603     }
1604     phaddd(vtmp1, src2);
1605     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1606   } else {
1607     pshufd(vtmp2, src2, 0xE);
1608     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1609     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1610   }
1611 }
1612 
1613 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1614   if (opcode == Op_AddReductionVI) {
1615     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1616     vextracti128_high(vtmp2, vtmp1);
1617     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1618     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1619   } else {
1620     vextracti128_high(vtmp1, src2);
1621     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1622     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1623   }
1624 }
1625 
1626 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1627   vextracti64x4_high(vtmp2, src2);
1628   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1629   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1630 }
1631 
1632 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1633   pshufd(vtmp2, src2, 0x1);
1634   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1635   movdqu(vtmp1, vtmp2);
1636   psrldq(vtmp1, 2);
1637   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1638   movdqu(vtmp2, vtmp1);
1639   psrldq(vtmp2, 1);
1640   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1641   movdl(vtmp2, src1);
1642   pmovsxbd(vtmp1, vtmp1);
1643   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1644   pextrb(dst, vtmp1, 0x0);
1645   movsbl(dst, dst);
1646 }
1647 
1648 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1649   pshufd(vtmp1, src2, 0xE);
1650   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1651   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1652 }
1653 
1654 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1655   vextracti128_high(vtmp2, src2);
1656   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1657   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1658 }
1659 
1660 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1661   vextracti64x4_high(vtmp1, src2);
1662   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1663   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1664 }
1665 
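// Byte multiply reductions widen to shorts first (there is no byte multiply
// instruction); the low 8 bits of the 16-bit product chain are the same as
// computing modulo 256 throughout, so widening does not change the byte
// result.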
1666 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1667   pmovsxbw(vtmp2, src2);
1668   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1669 }
1670 
1671 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1672   if (UseAVX > 1) {
1673     int vector_len = Assembler::AVX_256bit;
1674     vpmovsxbw(vtmp1, src2, vector_len);
1675     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1676   } else {
1677     pmovsxbw(vtmp2, src2);
1678     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1679     pshufd(vtmp2, src2, 0x1);
1680     pmovsxbw(vtmp2, src2);
1681     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1682   }
1683 }
1684 
1685 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1686   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1687     int vector_len = Assembler::AVX_512bit;
1688     vpmovsxbw(vtmp1, src2, vector_len);
1689     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1690   } else {
1691     assert(UseAVX >= 2,"Should not reach here.");
1692     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1693     vextracti128_high(vtmp2, src2);
1694     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1695   }
1696 }
1697 
1698 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1699   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1700   vextracti64x4_high(vtmp2, src2);
1701   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1702 }
1703 
1704 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1705   if (opcode == Op_AddReductionVI) {
1706     if (vtmp1 != src2) {
1707       movdqu(vtmp1, src2);
1708     }
1709     phaddw(vtmp1, vtmp1);
1710     phaddw(vtmp1, vtmp1);
1711   } else {
1712     pshufd(vtmp2, src2, 0x1);
1713     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1714     movdqu(vtmp1, vtmp2);
1715     psrldq(vtmp1, 2);
1716     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1717   }
1718   movdl(vtmp2, src1);
1719   pmovsxwd(vtmp1, vtmp1);
1720   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1721   pextrw(dst, vtmp1, 0x0);
1722   movswl(dst, dst);
1723 }
1724 
1725 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1726   if (opcode == Op_AddReductionVI) {
1727     if (vtmp1 != src2) {
1728       movdqu(vtmp1, src2);
1729     }
1730     phaddw(vtmp1, src2);
1731   } else {
1732     pshufd(vtmp1, src2, 0xE);
1733     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1734   }
1735   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1736 }
1737 
1738 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1739   if (opcode == Op_AddReductionVI) {
1740     int vector_len = Assembler::AVX_256bit;
1741     vphaddw(vtmp2, src2, src2, vector_len);
1742     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1743   } else {
1744     vextracti128_high(vtmp2, src2);
1745     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1746   }
1747   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1748 }
1749 
1750 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1751   int vector_len = Assembler::AVX_256bit;
1752   vextracti64x4_high(vtmp1, src2);
1753   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1754   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1755 }
1756 
1757 #ifdef _LP64
1758 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1759   pshufd(vtmp2, src2, 0xE);
1760   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1761   movdq(vtmp1, src1);
1762   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1763   movdq(dst, vtmp1);
1764 }
1765 
1766 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1767   vextracti128_high(vtmp1, src2);
1768   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1769   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1770 }
1771 
1772 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1773   vextracti64x4_high(vtmp2, src2);
1774   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1775   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1776 }
1777 #endif // _LP64
1778 
1779 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1780   reduce_operation_128(T_FLOAT, opcode, dst, src);
1781   pshufd(vtmp, src, 0x1);
1782   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1783 }
1784 
1785 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1786   reduce2F(opcode, dst, src, vtmp);
1787   pshufd(vtmp, src, 0x2);
1788   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1789   pshufd(vtmp, src, 0x3);
1790   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1791 }
1792 
1793 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1794   reduce4F(opcode, dst, src, vtmp2);
1795   vextractf128_high(vtmp2, src);
1796   reduce4F(opcode, dst, vtmp2, vtmp1);
1797 }
1798 
1799 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1800   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1801   vextracti64x4_high(vtmp1, src);
1802   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1803 }
1804 
1805 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1806   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1807   pshufd(vtmp, src, 0xE);
1808   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1809 }
1810 
1811 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1812   reduce2D(opcode, dst, src, vtmp2);
1813   vextractf128_high(vtmp2, src);
1814   reduce2D(opcode, dst, vtmp2, vtmp1);
1815 }
1816 
1817 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1818   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1819   vextracti64x4_high(vtmp1, src);
1820   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1821 }
1822 
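// Float/double min/max reductions cannot use a plain tree of minps/maxps:
// Java's Math.min/max semantics (NaN propagation, -0.0 < +0.0) differ from
// the bare SSE/AVX instructions, so every folding step goes through
// vminmax_fp with the extra tmp/atmp/btmp registers.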
1823 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1824                                           XMMRegister dst, XMMRegister src,
1825                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1826                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1827   int permconst[] = {1, 14};
1828   XMMRegister wsrc = src;
1829   XMMRegister wdst = xmm_0;
1830   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1831 
1832   int vlen_enc = Assembler::AVX_128bit;
1833   if (vlen == 16) {
1834     vlen_enc = Assembler::AVX_256bit;
1835   }
1836 
1837   for (int i = log2(vlen) - 1; i >=0; i--) {
1838     if (i == 0 && !is_dst_valid) {
1839       wdst = dst;
1840     }
1841     if (i == 3) {
1842       vextracti64x4_high(wtmp, wsrc);
1843     } else if (i == 2) {
1844       vextracti128_high(wtmp, wsrc);
1845     } else { // i = [0,1]
1846       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1847     }
1848     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1849     wsrc = wdst;
1850     vlen_enc = Assembler::AVX_128bit;
1851   }
1852   if (is_dst_valid) {
1853     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1854   }
1855 }
1856 
1857 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1858                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1859                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1860   XMMRegister wsrc = src;
1861   XMMRegister wdst = xmm_0;
1862   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1863   int vlen_enc = Assembler::AVX_128bit;
1864   if (vlen == 8) {
1865     vlen_enc = Assembler::AVX_256bit;
1866   }
1867   for (int i = log2(vlen) - 1; i >=0; i--) {
1868     if (i == 0 && !is_dst_valid) {
1869       wdst = dst;
1870     }
1871     if (i == 1) {
1872       vextracti128_high(wtmp, wsrc);
1873     } else if (i == 2) {
1874       vextracti64x4_high(wtmp, wsrc);
1875     } else {
1876       assert(i == 0, "%d", i);
1877       vpermilpd(wtmp, wsrc, 1, vlen_enc);
1878     }
1879     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1880     wsrc = wdst;
1881     vlen_enc = Assembler::AVX_128bit;
1882   }
1883   if (is_dst_valid) {
1884     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1885   }
1886 }
1887 
1888 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
1889   switch (bt) {
1890     case T_BYTE:  pextrb(dst, src, idx); break;
1891     case T_SHORT: pextrw(dst, src, idx); break;
1892     case T_INT:   pextrd(dst, src, idx); break;
1893     case T_LONG:  pextrq(dst, src, idx); break;
1894 
1895     default:
1896       assert(false,"Should not reach here.");
1897       break;
1898   }
1899 }
1900 
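// Element extraction helpers: get_lane() brings the 128-bit lane holding the
// requested element into dst (or returns src when the element is already in
// lane 0), and get_elem() then pulls the element itself out of that lane.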
1901 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
1902   int esize =  type2aelembytes(typ);
1903   int elem_per_lane = 16/esize;
1904   int lane = elemindex / elem_per_lane;
1905   int eindex = elemindex % elem_per_lane;
1906 
1907   if (lane >= 2) {
1908     assert(UseAVX > 2, "required");
1909     vextractf32x4(dst, src, lane & 3);
1910     return dst;
1911   } else if (lane > 0) {
1912     assert(UseAVX > 0, "required");
1913     vextractf128(dst, src, lane);
1914     return dst;
1915   } else {
1916     return src;
1917   }
1918 }
1919 
1920 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
1921   int esize =  type2aelembytes(typ);
1922   int elem_per_lane = 16/esize;
1923   int eindex = elemindex % elem_per_lane;
1924   assert(is_integral_type(typ),"required");
1925 
1926   if (eindex == 0) {
1927     if (typ == T_LONG) {
1928       movq(dst, src);
1929     } else {
1930       movdl(dst, src);
1931       if (typ == T_BYTE)
1932         movsbl(dst, dst);
1933       else if (typ == T_SHORT)
1934         movswl(dst, dst);
1935     }
1936   } else {
1937     extract(typ, dst, src, eindex);
1938   }
1939 }
1940 
1941 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
1942   int esize =  type2aelembytes(typ);
1943   int elem_per_lane = 16/esize;
1944   int eindex = elemindex % elem_per_lane;
1945   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
1946 
1947   if (eindex == 0) {
1948     movq(dst, src);
1949   } else {
1950     if (typ == T_FLOAT) {
1951       if (UseAVX == 0) {
1952         movdqu(dst, src);
1953         pshufps(dst, dst, eindex);
1954       } else {
1955         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
1956       }
1957     } else {
1958       if (UseAVX == 0) {
1959         movdqu(dst, src);
1960         psrldq(dst, eindex*esize);
1961       } else {
1962         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
1963       }
1964       movq(dst, dst);
1965     }
1966   }
1967   // Zero upper bits
1968   if (typ == T_FLOAT) {
1969     if (UseAVX == 0) {
1970       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
1971       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
1972       pand(dst, vtmp);
1973     } else {
1974       assert((tmp != noreg), "required.");
1975       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
1976     }
1977   }
1978 }
1979 
1980 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
1981   switch(typ) {
1982     case T_BYTE:
1983       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1984       break;
1985     case T_SHORT:
1986       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1987       break;
1988     case T_INT:
1989     case T_FLOAT:
1990       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1991       break;
1992     case T_LONG:
1993     case T_DOUBLE:
1994       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1995       break;
1996     default:
1997       assert(false,"Should not reach here.");
1998       break;
1999   }
2000 }
2001 
2002 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2003   switch(typ) {
2004     case T_BYTE:
2005       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2006       break;
2007     case T_SHORT:
2008       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2009       break;
2010     case T_INT:
2011     case T_FLOAT:
2012       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2013       break;
2014     case T_LONG:
2015     case T_DOUBLE:
2016       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2017       break;
2018     default:
2019       assert(false,"Should not reach here.");
2020       break;
2021   }
2022 }
2023 
2024 //-------------------------------------------------------------------------------------------
2025 
2026 // IndexOf for constant substrings with size >= 8 chars
2027 // which don't need to be loaded through stack.
2028 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2029                                          Register cnt1, Register cnt2,
2030                                          int int_cnt2,  Register result,
2031                                          XMMRegister vec, Register tmp,
2032                                          int ae) {
2033   ShortBranchVerifier sbv(this);
2034   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2035   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2036 
2037   // This method uses the pcmpestri instruction with bound registers
2038   //   inputs:
2039   //     xmm - substring
2040   //     rax - substring length (elements count)
2041   //     mem - scanned string
2042   //     rdx - string length (elements count)
2043   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2044   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2045   //   outputs:
2046   //     rcx - matched index in string
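  //   After pcmpestri the flags summarize the result: CF is set when a
  //   candidate match was found somewhere in the 16-byte window (rcx holds
  //   its element index) and OF is set when the match starts at element 0,
  //   i.e. the loaded substring head matched at the start of the window.
  //   The jcc(below)/jcc(overflow) tests below rely on this.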
2047   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2048   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2049   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2050   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2051   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2052 
2053   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2054         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2055         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2056 
2057   // Note, inline_string_indexOf() generates checks:
2058   // if (substr.count > string.count) return -1;
2059   // if (substr.count == 0) return 0;
2060   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2061 
2062   // Load substring.
2063   if (ae == StrIntrinsicNode::UL) {
2064     pmovzxbw(vec, Address(str2, 0));
2065   } else {
2066     movdqu(vec, Address(str2, 0));
2067   }
2068   movl(cnt2, int_cnt2);
2069   movptr(result, str1); // string addr
2070 
2071   if (int_cnt2 > stride) {
2072     jmpb(SCAN_TO_SUBSTR);
2073 
2074     // Reload substr for rescan; this code
2075     // is executed only for large substrings (> 8 chars)
2076     bind(RELOAD_SUBSTR);
2077     if (ae == StrIntrinsicNode::UL) {
2078       pmovzxbw(vec, Address(str2, 0));
2079     } else {
2080       movdqu(vec, Address(str2, 0));
2081     }
2082     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2083 
2084     bind(RELOAD_STR);
2085     // We came here after the beginning of the substring was
2086     // matched but the rest of it was not, so we need to search
2087     // again. Start from the next element after the previous match.
2088 
2089     // cnt2 is the number of remaining substring elements and
2090     // cnt1 is the number of remaining string elements when the compare failed.
2091     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2092     subl(cnt1, cnt2);
2093     addl(cnt1, int_cnt2);
2094     movl(cnt2, int_cnt2); // Now restore cnt2
2095 
2096     decrementl(cnt1);     // Shift to next element
2097     cmpl(cnt1, cnt2);
2098     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2099 
2100     addptr(result, (1<<scale1));
2101 
2102   } // (int_cnt2 > 8)
2103 
2104   // Scan string for start of substr in 16-byte vectors
2105   bind(SCAN_TO_SUBSTR);
2106   pcmpestri(vec, Address(result, 0), mode);
2107   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2108   subl(cnt1, stride);
2109   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2110   cmpl(cnt1, cnt2);
2111   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2112   addptr(result, 16);
2113   jmpb(SCAN_TO_SUBSTR);
2114 
2115   // Found a potential substr
2116   bind(FOUND_CANDIDATE);
2117   // Matched whole vector if first element matched (tmp(rcx) == 0).
2118   if (int_cnt2 == stride) {
2119     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2120   } else { // int_cnt2 > 8
2121     jccb(Assembler::overflow, FOUND_SUBSTR);
2122   }
2123   // After pcmpestri tmp(rcx) contains matched element index
2124   // Compute start addr of substr
2125   lea(result, Address(result, tmp, scale1));
2126 
2127   // Make sure string is still long enough
2128   subl(cnt1, tmp);
2129   cmpl(cnt1, cnt2);
2130   if (int_cnt2 == stride) {
2131     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2132   } else { // int_cnt2 > 8
2133     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2134   }
2135   // Left less than substring.
2136 
2137   bind(RET_NOT_FOUND);
2138   movl(result, -1);
2139   jmp(EXIT);
2140 
2141   if (int_cnt2 > stride) {
2142     // This code is optimized for the case where the whole substring
2143     // is matched whenever its head is matched.
2144     bind(MATCH_SUBSTR_HEAD);
2145     pcmpestri(vec, Address(result, 0), mode);
2146     // Reload only the string if it does not match
2147     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2148 
2149     Label CONT_SCAN_SUBSTR;
2150     // Compare the rest of substring (> 8 chars).
2151     bind(FOUND_SUBSTR);
2152     // First 8 chars are already matched.
2153     negptr(cnt2);
2154     addptr(cnt2, stride);
2155 
2156     bind(SCAN_SUBSTR);
2157     subl(cnt1, stride);
2158     cmpl(cnt2, -stride); // Do not read beyond substring
2159     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2160     // Back-up strings to avoid reading beyond substring:
2161     // cnt1 = cnt1 - cnt2 + 8
2162     addl(cnt1, cnt2); // cnt2 is negative
2163     addl(cnt1, stride);
2164     movl(cnt2, stride); negptr(cnt2);
2165     bind(CONT_SCAN_SUBSTR);
2166     if (int_cnt2 < (int)G) {
2167       int tail_off1 = int_cnt2<<scale1;
2168       int tail_off2 = int_cnt2<<scale2;
2169       if (ae == StrIntrinsicNode::UL) {
2170         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2171       } else {
2172         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2173       }
2174       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2175     } else {
2176       // calculate index in register to avoid integer overflow (int_cnt2*2)
2177       movl(tmp, int_cnt2);
2178       addptr(tmp, cnt2);
2179       if (ae == StrIntrinsicNode::UL) {
2180         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2181       } else {
2182         movdqu(vec, Address(str2, tmp, scale2, 0));
2183       }
2184       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2185     }
2186     // Need to reload string pointers if we did not match the whole vector
2187     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2188     addptr(cnt2, stride);
2189     jcc(Assembler::negative, SCAN_SUBSTR);
2190     // Fall through if found full substring
2191 
2192   } // (int_cnt2 > 8)
2193 
2194   bind(RET_FOUND);
2195   // Found result if we matched full small substring.
2196   // Compute substr offset
2197   subptr(result, str1);
2198   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2199     shrl(result, 1); // index
2200   }
2201   bind(EXIT);
2202 
2203 } // string_indexofC8
2204 
2205 // Small strings are loaded through the stack if they cross a page boundary.
2206 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2207                                        Register cnt1, Register cnt2,
2208                                        int int_cnt2,  Register result,
2209                                        XMMRegister vec, Register tmp,
2210                                        int ae) {
2211   ShortBranchVerifier sbv(this);
2212   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2213   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2214 
2215   //
2216   // int_cnt2 is the length of a small (< 8 chars) constant substring
2217   // or (-1) for a non-constant substring, in which case its length
2218   // is in the cnt2 register.
2219   //
2220   // Note, inline_string_indexOf() generates checks:
2221   // if (substr.count > string.count) return -1;
2222   // if (substr.count == 0) return 0;
2223   //
2224   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2225   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2226   // This method uses the pcmpestri instruction with bound registers
2227   //   inputs:
2228   //     xmm - substring
2229   //     rax - substring length (elements count)
2230   //     mem - scanned string
2231   //     rdx - string length (elements count)
2232   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2233   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2234   //   outputs:
2235   //     rcx - matched index in string
2236   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2237   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2238   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2239   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2240 
2241   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2242         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2243         FOUND_CANDIDATE;
2244 
2245   { //========================================================
2246     // We don't know where these strings are located
2247     // and we can't read beyond them. Load them through the stack.
2248     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2249 
2250     movptr(tmp, rsp); // save old SP
2251 
2252     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2253       if (int_cnt2 == (1>>scale2)) { // One byte
2254         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2255         load_unsigned_byte(result, Address(str2, 0));
2256         movdl(vec, result); // move 32 bits
2257       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2258         // Not enough header space in 32-bit VM: 12+3 = 15.
2259         movl(result, Address(str2, -1));
2260         shrl(result, 8);
2261         movdl(vec, result); // move 32 bits
2262       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2263         load_unsigned_short(result, Address(str2, 0));
2264         movdl(vec, result); // move 32 bits
2265       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2266         movdl(vec, Address(str2, 0)); // move 32 bits
2267       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2268         movq(vec, Address(str2, 0));  // move 64 bits
2269       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2270         // Array header size is 12 bytes in 32-bit VM
2271         // + 6 bytes for 3 chars == 18 bytes,
2272         // enough space to load vec and shift.
2273         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2274         if (ae == StrIntrinsicNode::UL) {
2275           int tail_off = int_cnt2-8;
2276           pmovzxbw(vec, Address(str2, tail_off));
2277           psrldq(vec, -2*tail_off);
2278         }
2279         else {
2280           int tail_off = int_cnt2*(1<<scale2);
2281           movdqu(vec, Address(str2, tail_off-16));
2282           psrldq(vec, 16-tail_off);
2283         }
2284       }
2285     } else { // not constant substring
2286       cmpl(cnt2, stride);
2287       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2288 
2289       // We can read beyond the string if str+16 does not cross a page boundary
2290       // since heaps are aligned and mapped by pages.
2291       assert(os::vm_page_size() < (int)G, "default page should be small");
2292       movl(result, str2); // We need only low 32 bits
2293       andl(result, (os::vm_page_size()-1));
2294       cmpl(result, (os::vm_page_size()-16));
2295       jccb(Assembler::belowEqual, CHECK_STR);
2296 
2297       // Move small strings to the stack to allow loading 16 bytes into vec.
2298       subptr(rsp, 16);
2299       int stk_offset = wordSize-(1<<scale2);
2300       push(cnt2);
2301 
2302       bind(COPY_SUBSTR);
2303       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2304         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2305         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2306       } else if (ae == StrIntrinsicNode::UU) {
2307         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2308         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2309       }
2310       decrement(cnt2);
2311       jccb(Assembler::notZero, COPY_SUBSTR);
2312 
2313       pop(cnt2);
2314       movptr(str2, rsp);  // New substring address
2315     } // non constant
2316 
2317     bind(CHECK_STR);
2318     cmpl(cnt1, stride);
2319     jccb(Assembler::aboveEqual, BIG_STRINGS);
2320 
2321     // Check cross page boundary.
2322     movl(result, str1); // We need only low 32 bits
2323     andl(result, (os::vm_page_size()-1));
2324     cmpl(result, (os::vm_page_size()-16));
2325     jccb(Assembler::belowEqual, BIG_STRINGS);
2326 
2327     subptr(rsp, 16);
2328     int stk_offset = -(1<<scale1);
2329     if (int_cnt2 < 0) { // not constant
2330       push(cnt2);
2331       stk_offset += wordSize;
2332     }
2333     movl(cnt2, cnt1);
2334 
2335     bind(COPY_STR);
2336     if (ae == StrIntrinsicNode::LL) {
2337       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2338       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2339     } else {
2340       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2341       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2342     }
2343     decrement(cnt2);
2344     jccb(Assembler::notZero, COPY_STR);
2345 
2346     if (int_cnt2 < 0) { // not constant
2347       pop(cnt2);
2348     }
2349     movptr(str1, rsp);  // New string address
2350 
2351     bind(BIG_STRINGS);
2352     // Load substring.
2353     if (int_cnt2 < 0) { // -1
2354       if (ae == StrIntrinsicNode::UL) {
2355         pmovzxbw(vec, Address(str2, 0));
2356       } else {
2357         movdqu(vec, Address(str2, 0));
2358       }
2359       push(cnt2);       // substr count
2360       push(str2);       // substr addr
2361       push(str1);       // string addr
2362     } else {
2363       // Small (< 8 chars) constant substrings are loaded already.
2364       movl(cnt2, int_cnt2);
2365     }
2366     push(tmp);  // original SP
2367 
2368   } // Finished loading
2369 
2370   //========================================================
2371   // Start search
2372   //
2373 
2374   movptr(result, str1); // string addr
2375 
2376   if (int_cnt2  < 0) {  // Only for non constant substring
2377     jmpb(SCAN_TO_SUBSTR);
2378 
2379     // SP saved at sp+0
2380     // String saved at sp+1*wordSize
2381     // Substr saved at sp+2*wordSize
2382     // Substr count saved at sp+3*wordSize
2383 
2384     // Reload substr for rescan; this code
2385     // is executed only for large substrings (> 8 chars)
2386     bind(RELOAD_SUBSTR);
2387     movptr(str2, Address(rsp, 2*wordSize));
2388     movl(cnt2, Address(rsp, 3*wordSize));
2389     if (ae == StrIntrinsicNode::UL) {
2390       pmovzxbw(vec, Address(str2, 0));
2391     } else {
2392       movdqu(vec, Address(str2, 0));
2393     }
2394     // We came here after the beginning of the substring was
2395     // matched but the rest of it was not, so we need to search
2396     // again. Start from the next element after the previous match.
2397     subptr(str1, result); // Restore counter
2398     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2399       shrl(str1, 1);
2400     }
2401     addl(cnt1, str1);
2402     decrementl(cnt1);   // Shift to next element
2403     cmpl(cnt1, cnt2);
2404     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2405 
2406     addptr(result, (1<<scale1));
2407   } // non constant
2408 
2409   // Scan string for start of substr in 16-byte vectors
2410   bind(SCAN_TO_SUBSTR);
2411   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2412   pcmpestri(vec, Address(result, 0), mode);
2413   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2414   subl(cnt1, stride);
2415   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2416   cmpl(cnt1, cnt2);
2417   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2418   addptr(result, 16);
2419 
2420   bind(ADJUST_STR);
2421   cmpl(cnt1, stride); // Do not read beyond string
2422   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2423   // Back-up string to avoid reading beyond string.
2424   lea(result, Address(result, cnt1, scale1, -16));
2425   movl(cnt1, stride);
2426   jmpb(SCAN_TO_SUBSTR);
2427 
2428   // Found a potential substr
2429   bind(FOUND_CANDIDATE);
2430   // After pcmpestri tmp(rcx) contains matched element index
2431 
2432   // Make sure string is still long enough
2433   subl(cnt1, tmp);
2434   cmpl(cnt1, cnt2);
2435   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2436   // Left less than substring.
2437 
2438   bind(RET_NOT_FOUND);
2439   movl(result, -1);
2440   jmp(CLEANUP);
2441 
2442   bind(FOUND_SUBSTR);
2443   // Compute start addr of substr
2444   lea(result, Address(result, tmp, scale1));
2445   if (int_cnt2 > 0) { // Constant substring
2446     // Repeat search for small substring (< 8 chars)
2447     // from new point without reloading substring.
2448     // Have to check that we don't read beyond string.
2449     cmpl(tmp, stride-int_cnt2);
2450     jccb(Assembler::greater, ADJUST_STR);
2451     // Fall through if matched whole substring.
2452   } else { // non constant
2453     assert(int_cnt2 == -1, "should be -1: non-constant substring");
2454 
2455     addl(tmp, cnt2);
2456     // Found result if we matched whole substring.
2457     cmpl(tmp, stride);
2458     jcc(Assembler::lessEqual, RET_FOUND);
2459 
2460     // Repeat search for small substring (<= 8 chars)
2461     // from new point 'str1' without reloading substring.
2462     cmpl(cnt2, stride);
2463     // Have to check that we don't read beyond string.
2464     jccb(Assembler::lessEqual, ADJUST_STR);
2465 
2466     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2467     // Compare the rest of substring (> 8 chars).
2468     movptr(str1, result);
2469 
2470     cmpl(tmp, cnt2);
2471     // First 8 chars are already matched.
2472     jccb(Assembler::equal, CHECK_NEXT);
2473 
2474     bind(SCAN_SUBSTR);
2475     pcmpestri(vec, Address(str1, 0), mode);
2476     // Need to reload string pointers if we did not match the whole vector
2477     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2478 
2479     bind(CHECK_NEXT);
2480     subl(cnt2, stride);
2481     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2482     addptr(str1, 16);
2483     if (ae == StrIntrinsicNode::UL) {
2484       addptr(str2, 8);
2485     } else {
2486       addptr(str2, 16);
2487     }
2488     subl(cnt1, stride);
2489     cmpl(cnt2, stride); // Do not read beyond substring
2490     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2491     // Back-up strings to avoid reading beyond substring.
2492 
2493     if (ae == StrIntrinsicNode::UL) {
2494       lea(str2, Address(str2, cnt2, scale2, -8));
2495       lea(str1, Address(str1, cnt2, scale1, -16));
2496     } else {
2497       lea(str2, Address(str2, cnt2, scale2, -16));
2498       lea(str1, Address(str1, cnt2, scale1, -16));
2499     }
2500     subl(cnt1, cnt2);
2501     movl(cnt2, stride);
2502     addl(cnt1, stride);
2503     bind(CONT_SCAN_SUBSTR);
2504     if (ae == StrIntrinsicNode::UL) {
2505       pmovzxbw(vec, Address(str2, 0));
2506     } else {
2507       movdqu(vec, Address(str2, 0));
2508     }
2509     jmp(SCAN_SUBSTR);
2510 
2511     bind(RET_FOUND_LONG);
2512     movptr(str1, Address(rsp, wordSize));
2513   } // non constant
2514 
2515   bind(RET_FOUND);
2516   // Compute substr offset
2517   subptr(result, str1);
2518   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2519     shrl(result, 1); // index
2520   }
2521   bind(CLEANUP);
2522   pop(rsp); // restore SP
2523 
2524 } // string_indexof
2525 
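// Scan a char[] (UTF-16) for a single char: broadcast the char to every lane,
// compare 16 chars (AVX2) or 8 chars (SSE) per iteration with pcmpeqw, detect
// a hit with ptest/vptest against a zero vector, then finish the remaining
// tail with a scalar loop.  On a vector hit, pmovmskb + bsf recover the byte
// offset of the first matching char.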
2526 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2527                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2528   ShortBranchVerifier sbv(this);
2529   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2530 
2531   int stride = 8;
2532 
2533   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2534         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2535         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2536         FOUND_SEQ_CHAR, DONE_LABEL;
2537 
2538   movptr(result, str1);
2539   if (UseAVX >= 2) {
2540     cmpl(cnt1, stride);
2541     jcc(Assembler::less, SCAN_TO_CHAR);
2542     cmpl(cnt1, 2*stride);
2543     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2544     movdl(vec1, ch);
2545     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2546     vpxor(vec2, vec2);
2547     movl(tmp, cnt1);
2548     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2549     andl(cnt1,0x0000000F);  //tail count (in chars)
2550 
2551     bind(SCAN_TO_16_CHAR_LOOP);
2552     vmovdqu(vec3, Address(result, 0));
2553     vpcmpeqw(vec3, vec3, vec1, 1);
2554     vptest(vec2, vec3);
2555     jcc(Assembler::carryClear, FOUND_CHAR);
2556     addptr(result, 32);
2557     subl(tmp, 2*stride);
2558     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2559     jmp(SCAN_TO_8_CHAR);
2560     bind(SCAN_TO_8_CHAR_INIT);
2561     movdl(vec1, ch);
2562     pshuflw(vec1, vec1, 0x00);
2563     pshufd(vec1, vec1, 0);
2564     pxor(vec2, vec2);
2565   }
2566   bind(SCAN_TO_8_CHAR);
2567   cmpl(cnt1, stride);
2568   jcc(Assembler::less, SCAN_TO_CHAR);
2569   if (UseAVX < 2) {
2570     movdl(vec1, ch);
2571     pshuflw(vec1, vec1, 0x00);
2572     pshufd(vec1, vec1, 0);
2573     pxor(vec2, vec2);
2574   }
2575   movl(tmp, cnt1);
2576   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2577   andl(cnt1,0x00000007);  //tail count (in chars)
2578 
2579   bind(SCAN_TO_8_CHAR_LOOP);
2580   movdqu(vec3, Address(result, 0));
2581   pcmpeqw(vec3, vec1);
2582   ptest(vec2, vec3);
2583   jcc(Assembler::carryClear, FOUND_CHAR);
2584   addptr(result, 16);
2585   subl(tmp, stride);
2586   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2587   bind(SCAN_TO_CHAR);
2588   testl(cnt1, cnt1);
2589   jcc(Assembler::zero, RET_NOT_FOUND);
2590   bind(SCAN_TO_CHAR_LOOP);
2591   load_unsigned_short(tmp, Address(result, 0));
2592   cmpl(ch, tmp);
2593   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2594   addptr(result, 2);
2595   subl(cnt1, 1);
2596   jccb(Assembler::zero, RET_NOT_FOUND);
2597   jmp(SCAN_TO_CHAR_LOOP);
2598 
2599   bind(RET_NOT_FOUND);
2600   movl(result, -1);
2601   jmpb(DONE_LABEL);
2602 
2603   bind(FOUND_CHAR);
2604   if (UseAVX >= 2) {
2605     vpmovmskb(tmp, vec3);
2606   } else {
2607     pmovmskb(tmp, vec3);
2608   }
2609   bsfl(ch, tmp);
2610   addl(result, ch);
2611 
2612   bind(FOUND_SEQ_CHAR);
2613   subptr(result, str1);
2614   shrl(result, 1);
2615 
2616   bind(DONE_LABEL);
2617 } // string_indexof_char
2618 
2619 // helper function for string_compare
2620 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2621                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2622                                            Address::ScaleFactor scale2, Register index, int ae) {
2623   if (ae == StrIntrinsicNode::LL) {
2624     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2625     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2626   } else if (ae == StrIntrinsicNode::UU) {
2627     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2628     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2629   } else {
2630     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2631     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2632   }
2633 }
2634 
2635 // Compare strings, used for char[] and byte[].
2636 void C2_MacroAssembler::string_compare(Register str1, Register str2,
2637                                        Register cnt1, Register cnt2, Register result,
2638                                        XMMRegister vec1, int ae) {
2639   ShortBranchVerifier sbv(this);
2640   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
2641   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
2642   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
2643   int stride2x2 = 0x40;
2644   Address::ScaleFactor scale = Address::no_scale;
2645   Address::ScaleFactor scale1 = Address::no_scale;
2646   Address::ScaleFactor scale2 = Address::no_scale;
2647 
2648   if (ae != StrIntrinsicNode::LL) {
2649     stride2x2 = 0x20;
2650   }
2651 
2652   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
2653     shrl(cnt2, 1);
2654   }
2655   // Compute the minimum of the string lengths and push the
2656   // difference of the string lengths onto the stack.
2657   // Use a conditional move to compute the minimum.
2658   movl(result, cnt1);
2659   subl(cnt1, cnt2);
2660   push(cnt1);
2661   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
2662 
2663   // Is the minimum length zero?
2664   testl(cnt2, cnt2);
2665   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2666   if (ae == StrIntrinsicNode::LL) {
2667     // Load first bytes
2668     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
2669     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
2670   } else if (ae == StrIntrinsicNode::UU) {
2671     // Load first characters
2672     load_unsigned_short(result, Address(str1, 0));
2673     load_unsigned_short(cnt1, Address(str2, 0));
2674   } else {
2675     load_unsigned_byte(result, Address(str1, 0));
2676     load_unsigned_short(cnt1, Address(str2, 0));
2677   }
2678   subl(result, cnt1);
2679   jcc(Assembler::notZero,  POP_LABEL);
2680 
2681   if (ae == StrIntrinsicNode::UU) {
2682     // Divide length by 2 to get number of chars
2683     shrl(cnt2, 1);
2684   }
2685   cmpl(cnt2, 1);
2686   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2687 
2688   // Check if the strings start at the same location and setup scale and stride
2689   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2690     cmpptr(str1, str2);
2691     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2692     if (ae == StrIntrinsicNode::LL) {
2693       scale = Address::times_1;
2694       stride = 16;
2695     } else {
2696       scale = Address::times_2;
2697       stride = 8;
2698     }
2699   } else {
2700     scale1 = Address::times_1;
2701     scale2 = Address::times_2;
2702     // scale not used
2703     stride = 8;
2704   }
2705 
2706   if (UseAVX >= 2 && UseSSE42Intrinsics) {
2707     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
2708     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
2709     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
2710     Label COMPARE_TAIL_LONG;
2711     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only when _LP64 && AVX3
2712 
2713     int pcmpmask = 0x19;
2714     if (ae == StrIntrinsicNode::LL) {
2715       pcmpmask &= ~0x01;
2716     }
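         // In the pcmpestri immediate, 0x19 selects 'equal each' aggregation with negated
         // polarity on unsigned words; clearing bit 0 switches to unsigned bytes for LL.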
2717 
2718     // Set up to compare 16-char (32-byte) vectors,
2719     // starting from the first character again because it has an aligned address.
2720     if (ae == StrIntrinsicNode::LL) {
2721       stride2 = 32;
2722     } else {
2723       stride2 = 16;
2724     }
2725     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2726       adr_stride = stride << scale;
2727     } else {
2728       adr_stride1 = 8;  //stride << scale1;
2729       adr_stride2 = 16; //stride << scale2;
2730     }
2731 
2732     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2733     // rax and rdx are used by pcmpestri as element counters
2734     movl(result, cnt2);
2735     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
2736     jcc(Assembler::zero, COMPARE_TAIL_LONG);
2737 
2738     // Fast path: compare the first two 8-char vectors.
2739     bind(COMPARE_16_CHARS);
2740     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2741       movdqu(vec1, Address(str1, 0));
2742     } else {
2743       pmovzxbw(vec1, Address(str1, 0));
2744     }
2745     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2746     jccb(Assembler::below, COMPARE_INDEX_CHAR);
2747 
2748     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2749       movdqu(vec1, Address(str1, adr_stride));
2750       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
2751     } else {
2752       pmovzxbw(vec1, Address(str1, adr_stride1));
2753       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
2754     }
2755     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
2756     addl(cnt1, stride);
2757 
2758     // Compare the characters at index in cnt1
2759     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
2760     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2761     subl(result, cnt2);
2762     jmp(POP_LABEL);
2763 
2764     // Setup the registers to start vector comparison loop
2765     bind(COMPARE_WIDE_VECTORS);
2766     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2767       lea(str1, Address(str1, result, scale));
2768       lea(str2, Address(str2, result, scale));
2769     } else {
2770       lea(str1, Address(str1, result, scale1));
2771       lea(str2, Address(str2, result, scale2));
2772     }
2773     subl(result, stride2);
2774     subl(cnt2, stride2);
2775     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
2776     negptr(result);
2777 
2778     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
2779     bind(COMPARE_WIDE_VECTORS_LOOP);
2780 
2781 #ifdef _LP64
2782     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
2783       cmpl(cnt2, stride2x2);
2784       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2785       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
2786       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
2787 
2788       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2789       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2790         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
2791         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
2792       } else {
2793         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
2794         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
2795       }
2796       kortestql(k7, k7);
2797       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
2798       addptr(result, stride2x2);  // update since we already compared at this addr
2799       subl(cnt2, stride2x2);      // and sub the size too
2800       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2801 
2802       vpxor(vec1, vec1);
2803       jmpb(COMPARE_WIDE_TAIL);
2804     }//if (VM_Version::supports_avx512vlbw())
2805 #endif // _LP64
2806 
2807 
2808     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2809     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2810       vmovdqu(vec1, Address(str1, result, scale));
2811       vpxor(vec1, Address(str2, result, scale));
2812     } else {
2813       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
2814       vpxor(vec1, Address(str2, result, scale2));
2815     }
2816     vptest(vec1, vec1);
2817     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
2818     addptr(result, stride2);
2819     subl(cnt2, stride2);
2820     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
2821     // clean upper bits of YMM registers
2822     vpxor(vec1, vec1);
2823 
2824     // compare wide vectors tail
2825     bind(COMPARE_WIDE_TAIL);
2826     testptr(result, result);
2827     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2828 
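         // Compare the last stride2 chars once more: str1/str2 point past the ends, so
         // result == -stride2 addresses the tail (overlapping already-compared chars is harmless).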
2829     movl(result, stride2);
2830     movl(cnt2, result);
2831     negptr(result);
2832     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2833 
2834     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
2835     bind(VECTOR_NOT_EQUAL);
2836     // clean upper bits of YMM registers
2837     vpxor(vec1, vec1);
2838     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2839       lea(str1, Address(str1, result, scale));
2840       lea(str2, Address(str2, result, scale));
2841     } else {
2842       lea(str1, Address(str1, result, scale1));
2843       lea(str2, Address(str2, result, scale2));
2844     }
2845     jmp(COMPARE_16_CHARS);
2846 
2847     // Compare tail chars, length between 1 and 15 chars
2848     bind(COMPARE_TAIL_LONG);
2849     movl(cnt2, result);
2850     cmpl(cnt2, stride);
2851     jcc(Assembler::less, COMPARE_SMALL_STR);
2852 
2853     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2854       movdqu(vec1, Address(str1, 0));
2855     } else {
2856       pmovzxbw(vec1, Address(str1, 0));
2857     }
2858     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2859     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2860     subptr(cnt2, stride);
2861     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2862     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2863       lea(str1, Address(str1, result, scale));
2864       lea(str2, Address(str2, result, scale));
2865     } else {
2866       lea(str1, Address(str1, result, scale1));
2867       lea(str2, Address(str2, result, scale2));
2868     }
2869     negptr(cnt2);
2870     jmpb(WHILE_HEAD_LABEL);
2871 
2872     bind(COMPARE_SMALL_STR);
2873   } else if (UseSSE42Intrinsics) {
2874     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2875     int pcmpmask = 0x19;
2876     // Set up to compare 8-char (16-byte) vectors,
2877     // starting from the first character again because it has an aligned address.
2878     movl(result, cnt2);
2879     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2880     if (ae == StrIntrinsicNode::LL) {
2881       pcmpmask &= ~0x01;
2882     }
2883     jcc(Assembler::zero, COMPARE_TAIL);
2884     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2885       lea(str1, Address(str1, result, scale));
2886       lea(str2, Address(str2, result, scale));
2887     } else {
2888       lea(str1, Address(str1, result, scale1));
2889       lea(str2, Address(str2, result, scale2));
2890     }
2891     negptr(result);
2892 
2893     // pcmpestri
2894     //   inputs:
2895     //     vec1 - substring
2896     //     rax  - negative string length (element count)
2897     //     mem  - scanned string
2898     //     rdx  - string length (element count)
2899     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2900     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2901     //   outputs:
2902     //     rcx - first mismatched element index
2903     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2904 
2905     bind(COMPARE_WIDE_VECTORS);
2906     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2907       movdqu(vec1, Address(str1, result, scale));
2908       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2909     } else {
2910       pmovzxbw(vec1, Address(str1, result, scale1));
2911       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2912     }
2913     // After pcmpestri cnt1(rcx) contains mismatched element index
2914 
2915     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2916     addptr(result, stride);
2917     subptr(cnt2, stride);
2918     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2919 
2920     // compare wide vectors tail
2921     testptr(result, result);
2922     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2923 
2924     movl(cnt2, stride);
2925     movl(result, stride);
2926     negptr(result);
2927     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2928       movdqu(vec1, Address(str1, result, scale));
2929       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2930     } else {
2931       pmovzxbw(vec1, Address(str1, result, scale1));
2932       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2933     }
2934     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2935 
2936     // Mismatched characters in the vectors
2937     bind(VECTOR_NOT_EQUAL);
2938     addptr(cnt1, result);
2939     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2940     subl(result, cnt2);
2941     jmpb(POP_LABEL);
2942 
2943     bind(COMPARE_TAIL); // limit is zero
2944     movl(cnt2, result);
2945     // Fallthru to tail compare
2946   }
2947   // Shift str2 and str1 to the end of the arrays, negate min
2948   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2949     lea(str1, Address(str1, cnt2, scale));
2950     lea(str2, Address(str2, cnt2, scale));
2951   } else {
2952     lea(str1, Address(str1, cnt2, scale1));
2953     lea(str2, Address(str2, cnt2, scale2));
2954   }
2955   decrementl(cnt2);  // first character was compared already
2956   negptr(cnt2);
2957 
2958   // Compare the rest of the elements
2959   bind(WHILE_HEAD_LABEL);
2960   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2961   subl(result, cnt1);
2962   jccb(Assembler::notZero, POP_LABEL);
2963   increment(cnt2);
2964   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2965 
2966   // Strings are equal up to min length.  Return the length difference.
2967   bind(LENGTH_DIFF_LABEL);
2968   pop(result);
2969   if (ae == StrIntrinsicNode::UU) {
2970     // Divide diff by 2 to get number of chars
2971     sarl(result, 1);
2972   }
2973   jmpb(DONE_LABEL);
2974 
2975 #ifdef _LP64
2976   if (VM_Version::supports_avx512vlbw()) {
2977 
2978     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2979 
2980     kmovql(cnt1, k7);
2981     notq(cnt1);
2982     bsfq(cnt2, cnt1);
2983     if (ae != StrIntrinsicNode::LL) {
2984       // Divide diff by 2 to get number of chars
2985       sarl(cnt2, 1);
2986     }
2987     addq(result, cnt2);
2988     if (ae == StrIntrinsicNode::LL) {
2989       load_unsigned_byte(cnt1, Address(str2, result));
2990       load_unsigned_byte(result, Address(str1, result));
2991     } else if (ae == StrIntrinsicNode::UU) {
2992       load_unsigned_short(cnt1, Address(str2, result, scale));
2993       load_unsigned_short(result, Address(str1, result, scale));
2994     } else {
2995       load_unsigned_short(cnt1, Address(str2, result, scale2));
2996       load_unsigned_byte(result, Address(str1, result, scale1));
2997     }
2998     subl(result, cnt1);
2999     jmpb(POP_LABEL);
3000   }//if (VM_Version::supports_avx512vlbw())
3001 #endif // _LP64
3002 
3003   // Discard the stored length difference
3004   bind(POP_LABEL);
3005   pop(cnt1);
3006 
3007   // That's it
3008   bind(DONE_LABEL);
3009   if (ae == StrIntrinsicNode::UL) {
3010     negl(result);
3011   }
3012 
3013 }
3014 
3015 // Search for a non-ASCII character (negative byte value) in a byte array and
3016 // return true if one is found, false otherwise.
3017 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3018 //   @HotSpotIntrinsicCandidate
3019 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3020 //     for (int i = off; i < off + len; i++) {
3021 //       if (ba[i] < 0) {
3022 //         return true;
3023 //       }
3024 //     }
3025 //     return false;
3026 //   }
3027 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3028                                       Register result, Register tmp1,
3029                                       XMMRegister vec1, XMMRegister vec2) {
3030   // rsi: byte array
3031   // rcx: len
3032   // rax: result
3033   ShortBranchVerifier sbv(this);
3034   assert_different_registers(ary1, len, result, tmp1);
3035   assert_different_registers(vec1, vec2);
3036   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3037 
3038   // len == 0
3039   testl(len, len);
3040   jcc(Assembler::zero, FALSE_LABEL);
3041 
3042   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3043     VM_Version::supports_avx512vlbw() &&
3044     VM_Version::supports_bmi2()) {
3045 
3046     Label test_64_loop, test_tail;
3047     Register tmp3_aliased = len;
3048 
3049     movl(tmp1, len);
3050     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
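         // vec2 stays all-zero; the evpcmpgtb below computes (0 > byte), flagging negative bytes.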
3051 
3052     andl(tmp1, 64 - 1);   // tail count (in bytes) 0x3F
3053     andl(len, ~(64 - 1)); // vector count (in bytes)
3054     jccb(Assembler::zero, test_tail);
3055 
3056     lea(ary1, Address(ary1, len, Address::times_1));
3057     negptr(len);
3058 
3059     bind(test_64_loop);
3060     // Check whether these 64 byte-sized elements contain any negatives
3061     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3062     kortestql(k2, k2);
3063     jcc(Assembler::notZero, TRUE_LABEL);
3064 
3065     addptr(len, 64);
3066     jccb(Assembler::notZero, test_64_loop);
3067 
3068 
3069     bind(test_tail);
3070     // bail out when there is nothing to be done
3071     testl(tmp1, -1);
3072     jcc(Assembler::zero, FALSE_LABEL);
3073 
3074     // ~(~0 << len) applied up to two times (for 32-bit scenario)
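         // e.g. a tail count of 3 gives ~(~0 << 3) == 0b111, selecting the 3 tail bytes.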
3075 #ifdef _LP64
3076     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3077     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3078     notq(tmp3_aliased);
3079     kmovql(k3, tmp3_aliased);
3080 #else
3081     Label k_init;
3082     jmp(k_init);
3083 
3084     // We cannot read 64 bits from a general purpose register, thus we move the
3085     // data required to compose 64 1's to the instruction stream.
3086     // We emit a 64-byte-wide series of elements from 0..63 which is later used
3087     // as a compare target against the tail count contained in the tmp1 register.
3088     // The result is a k register holding tmp1 consecutive 1 bits, counting from
3089     // the least significant bit.
3090     address tmp = pc();
3091     emit_int64(0x0706050403020100);
3092     emit_int64(0x0F0E0D0C0B0A0908);
3093     emit_int64(0x1716151413121110);
3094     emit_int64(0x1F1E1D1C1B1A1918);
3095     emit_int64(0x2726252423222120);
3096     emit_int64(0x2F2E2D2C2B2A2928);
3097     emit_int64(0x3736353433323130);
3098     emit_int64(0x3F3E3D3C3B3A3938);
3099 
3100     bind(k_init);
3101     lea(len, InternalAddress(tmp));
3102     // create mask to test for negative byte inside a vector
3103     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3104     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
3105 
3106 #endif
3107     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3108     ktestq(k2, k3);
3109     jcc(Assembler::notZero, TRUE_LABEL);
3110 
3111     jmp(FALSE_LABEL);
3112   } else {
3113     movl(result, len); // copy
3114 
3115     if (UseAVX >= 2 && UseSSE >= 2) {
3116       // With AVX2, use 32-byte vector compare
3117       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3118 
3119       // Compare 32-byte vectors
3120       andl(result, 0x0000001f);  //   tail count (in bytes)
3121       andl(len, 0xffffffe0);   // vector count (in bytes)
3122       jccb(Assembler::zero, COMPARE_TAIL);
3123 
3124       lea(ary1, Address(ary1, len, Address::times_1));
3125       negptr(len);
3126 
3127       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
3128       movdl(vec2, tmp1);
3129       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3130 
3131       bind(COMPARE_WIDE_VECTORS);
3132       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3133       vptest(vec1, vec2);
3134       jccb(Assembler::notZero, TRUE_LABEL);
3135       addptr(len, 32);
3136       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3137 
3138       testl(result, result);
3139       jccb(Assembler::zero, FALSE_LABEL);
3140 
3141       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3142       vptest(vec1, vec2);
3143       jccb(Assembler::notZero, TRUE_LABEL);
3144       jmpb(FALSE_LABEL);
3145 
3146       bind(COMPARE_TAIL); // len is zero
3147       movl(len, result);
3148       // Fallthru to tail compare
3149     } else if (UseSSE42Intrinsics) {
3150       // With SSE4.2, use double quad vector compare
3151       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3152 
3153       // Compare 16-byte vectors
3154       andl(result, 0x0000000f);  //   tail count (in bytes)
3155       andl(len, 0xfffffff0);   // vector count (in bytes)
3156       jcc(Assembler::zero, COMPARE_TAIL);
3157 
3158       lea(ary1, Address(ary1, len, Address::times_1));
3159       negptr(len);
3160 
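           // Broadcast the sign-bit mask 0x80 into every byte of vec2; ptest against it
           // detects any byte with its high bit set (i.e. a negative byte).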
3161       movl(tmp1, 0x80808080);
3162       movdl(vec2, tmp1);
3163       pshufd(vec2, vec2, 0);
3164 
3165       bind(COMPARE_WIDE_VECTORS);
3166       movdqu(vec1, Address(ary1, len, Address::times_1));
3167       ptest(vec1, vec2);
3168       jcc(Assembler::notZero, TRUE_LABEL);
3169       addptr(len, 16);
3170       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3171 
3172       testl(result, result);
3173       jcc(Assembler::zero, FALSE_LABEL);
3174 
3175       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3176       ptest(vec1, vec2);
3177       jccb(Assembler::notZero, TRUE_LABEL);
3178       jmpb(FALSE_LABEL);
3179 
3180       bind(COMPARE_TAIL); // len is zero
3181       movl(len, result);
3182       // Fallthru to tail compare
3183     }
3184   }
3185   // Compare 4-byte vectors
3186   andl(len, 0xfffffffc); // vector count (in bytes)
3187   jccb(Assembler::zero, COMPARE_CHAR);
3188 
3189   lea(ary1, Address(ary1, len, Address::times_1));
3190   negptr(len);
3191 
3192   bind(COMPARE_VECTORS);
3193   movl(tmp1, Address(ary1, len, Address::times_1));
3194   andl(tmp1, 0x80808080);
3195   jccb(Assembler::notZero, TRUE_LABEL);
3196   addptr(len, 4);
3197   jcc(Assembler::notZero, COMPARE_VECTORS);
3198 
3199   // Compare trailing char (final 2 bytes), if any
3200   bind(COMPARE_CHAR);
3201   testl(result, 0x2);   // tail  char
3202   jccb(Assembler::zero, COMPARE_BYTE);
3203   load_unsigned_short(tmp1, Address(ary1, 0));
3204   andl(tmp1, 0x00008080);
3205   jccb(Assembler::notZero, TRUE_LABEL);
3206   subptr(result, 2);
3207   lea(ary1, Address(ary1, 2));
3208 
3209   bind(COMPARE_BYTE);
3210   testl(result, 0x1);   // tail  byte
3211   jccb(Assembler::zero, FALSE_LABEL);
3212   load_unsigned_byte(tmp1, Address(ary1, 0));
3213   andl(tmp1, 0x00000080);
3214   jccb(Assembler::notEqual, TRUE_LABEL);
3215   jmpb(FALSE_LABEL);
3216 
3217   bind(TRUE_LABEL);
3218   movl(result, 1);   // return true
3219   jmpb(DONE);
3220 
3221   bind(FALSE_LABEL);
3222   xorl(result, result); // return false
3223 
3224   // That's it
3225   bind(DONE);
3226   if (UseAVX >= 2 && UseSSE >= 2) {
3227     // clean upper bits of YMM registers
3228     vpxor(vec1, vec1);
3229     vpxor(vec2, vec2);
3230   }
3231 }
3232 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
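     // When is_array_equ is set, ary1/ary2 are array oops (null and length checks are done
     // and data starts at base_offset); otherwise they are raw element addresses and 'limit'
     // holds the element count.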
3233 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3234                                       Register limit, Register result, Register chr,
3235                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
3236   ShortBranchVerifier sbv(this);
3237   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3238 
3239   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3240   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3241 
3242   if (is_array_equ) {
3243     // Check the input args
3244     cmpoop(ary1, ary2);
3245     jcc(Assembler::equal, TRUE_LABEL);
3246 
3247     // Need additional checks for arrays_equals.
3248     testptr(ary1, ary1);
3249     jcc(Assembler::zero, FALSE_LABEL);
3250     testptr(ary2, ary2);
3251     jcc(Assembler::zero, FALSE_LABEL);
3252 
3253     // Check the lengths
3254     movl(limit, Address(ary1, length_offset));
3255     cmpl(limit, Address(ary2, length_offset));
3256     jcc(Assembler::notEqual, FALSE_LABEL);
3257   }
3258 
3259   // count == 0
3260   testl(limit, limit);
3261   jcc(Assembler::zero, TRUE_LABEL);
3262 
3263   if (is_array_equ) {
3264     // Load array address
3265     lea(ary1, Address(ary1, base_offset));
3266     lea(ary2, Address(ary2, base_offset));
3267   }
3268 
3269   if (is_array_equ && is_char) {
3270     // arrays_equals used for char[]: convert the element count to a byte count.
3271     shll(limit, 1);      // byte count is still != 0
3272   }
3273   movl(result, limit); // copy
3274 
3275   if (UseAVX >= 2) {
3276     // With AVX2, use 32-byte vector compare
3277     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3278 
3279     // Compare 32-byte vectors
3280     andl(result, 0x0000001f);  //   tail count (in bytes)
3281     andl(limit, 0xffffffe0);   // vector count (in bytes)
3282     jcc(Assembler::zero, COMPARE_TAIL);
3283 
3284     lea(ary1, Address(ary1, limit, Address::times_1));
3285     lea(ary2, Address(ary2, limit, Address::times_1));
3286     negptr(limit);
3287 
3288 #ifdef _LP64
3289     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3290       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3291 
3292       cmpl(limit, -64);
3293       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
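           // limit is a negative byte count; when fewer than 64 bytes remain, fall back
           // to the 32-byte AVX2 loop below.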
3294 
3295       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3296 
3297       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3298       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3299       kortestql(k7, k7);
3300       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3301       addptr(limit, 64);  // update since we already compared at this addr
3302       cmpl(limit, -64);
3303       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3304 
3305       // At this point we may still need to compare -limit+result bytes.
3306       // We could execute the next two instructions and just continue via the non-wide path:
3307       //  cmpl(limit, 0);
3308       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3309       // But since we stopped at the points ary{1,2}+limit which are
3310       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3311       // (|limit| <= 32 and result < 32),
3312       // we may just compare the last 64 bytes.
3313       //
3314       addptr(result, -64);   // it is safe because we just came from this area
3315       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3316       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3317       kortestql(k7, k7);
3318       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3319 
3320       jmp(TRUE_LABEL);
3321 
3322       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3323 
3324     }//if (VM_Version::supports_avx512vlbw())
3325 #endif //_LP64
3326     bind(COMPARE_WIDE_VECTORS);
3327     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3328     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3329     vpxor(vec1, vec2);
3330 
3331     vptest(vec1, vec1);
3332     jcc(Assembler::notZero, FALSE_LABEL);
3333     addptr(limit, 32);
3334     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3335 
3336     testl(result, result);
3337     jcc(Assembler::zero, TRUE_LABEL);
3338 
3339     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3340     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3341     vpxor(vec1, vec2);
3342 
3343     vptest(vec1, vec1);
3344     jccb(Assembler::notZero, FALSE_LABEL);
3345     jmpb(TRUE_LABEL);
3346 
3347     bind(COMPARE_TAIL); // limit is zero
3348     movl(limit, result);
3349     // Fallthru to tail compare
3350   } else if (UseSSE42Intrinsics) {
3351     // With SSE4.2, use double quad vector compare
3352     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3353 
3354     // Compare 16-byte vectors
3355     andl(result, 0x0000000f);  //   tail count (in bytes)
3356     andl(limit, 0xfffffff0);   // vector count (in bytes)
3357     jcc(Assembler::zero, COMPARE_TAIL);
3358 
3359     lea(ary1, Address(ary1, limit, Address::times_1));
3360     lea(ary2, Address(ary2, limit, Address::times_1));
3361     negptr(limit);
3362 
3363     bind(COMPARE_WIDE_VECTORS);
3364     movdqu(vec1, Address(ary1, limit, Address::times_1));
3365     movdqu(vec2, Address(ary2, limit, Address::times_1));
3366     pxor(vec1, vec2);
3367 
3368     ptest(vec1, vec1);
3369     jcc(Assembler::notZero, FALSE_LABEL);
3370     addptr(limit, 16);
3371     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3372 
3373     testl(result, result);
3374     jcc(Assembler::zero, TRUE_LABEL);
3375 
3376     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3377     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3378     pxor(vec1, vec2);
3379 
3380     ptest(vec1, vec1);
3381     jccb(Assembler::notZero, FALSE_LABEL);
3382     jmpb(TRUE_LABEL);
3383 
3384     bind(COMPARE_TAIL); // limit is zero
3385     movl(limit, result);
3386     // Fallthru to tail compare
3387   }
3388 
3389   // Compare 4-byte vectors
3390   andl(limit, 0xfffffffc); // vector count (in bytes)
3391   jccb(Assembler::zero, COMPARE_CHAR);
3392 
3393   lea(ary1, Address(ary1, limit, Address::times_1));
3394   lea(ary2, Address(ary2, limit, Address::times_1));
3395   negptr(limit);
3396 
3397   bind(COMPARE_VECTORS);
3398   movl(chr, Address(ary1, limit, Address::times_1));
3399   cmpl(chr, Address(ary2, limit, Address::times_1));
3400   jccb(Assembler::notEqual, FALSE_LABEL);
3401   addptr(limit, 4);
3402   jcc(Assembler::notZero, COMPARE_VECTORS);
3403 
3404   // Compare trailing char (final 2 bytes), if any
3405   bind(COMPARE_CHAR);
3406   testl(result, 0x2);   // tail  char
3407   jccb(Assembler::zero, COMPARE_BYTE);
3408   load_unsigned_short(chr, Address(ary1, 0));
3409   load_unsigned_short(limit, Address(ary2, 0));
3410   cmpl(chr, limit);
3411   jccb(Assembler::notEqual, FALSE_LABEL);
3412 
3413   if (is_array_equ && is_char) {
3414     bind(COMPARE_BYTE);
3415   } else {
3416     lea(ary1, Address(ary1, 2));
3417     lea(ary2, Address(ary2, 2));
3418 
3419     bind(COMPARE_BYTE);
3420     testl(result, 0x1);   // tail  byte
3421     jccb(Assembler::zero, TRUE_LABEL);
3422     load_unsigned_byte(chr, Address(ary1, 0));
3423     load_unsigned_byte(limit, Address(ary2, 0));
3424     cmpl(chr, limit);
3425     jccb(Assembler::notEqual, FALSE_LABEL);
3426   }
3427   bind(TRUE_LABEL);
3428   movl(result, 1);   // return true
3429   jmpb(DONE);
3430 
3431   bind(FALSE_LABEL);
3432   xorl(result, result); // return false
3433 
3434   // That's it
3435   bind(DONE);
3436   if (UseAVX >= 2) {
3437     // clean upper bits of YMM registers
3438     vpxor(vec1, vec1);
3439     vpxor(vec2, vec2);
3440   }
3441 }