1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  52   guarantee(PostLoopMultiversioning, "must be");
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(k1, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask() {
  61   guarantee(PostLoopMultiversioning, "must be");
  62   Assembler::knotwl(k1, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
  71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  72 
  73   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  74   if (PrintPreciseRTMLockingStatistics) {
  75     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  76       Label check_abort;
  77       testl(abort_status, (1<<i));
  78       jccb(Assembler::equal, check_abort);
  79       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  80       bind(check_abort);
  81     }
  82   }
  83 }
  84 
  85 // Branch if (random & (count-1) != 0), count is 2^n
  86 // tmp, scr and flags are killed
  87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  88   assert(tmp == rax, "");
  89   assert(scr == rdx, "");
  90   rdtsc(); // modifies EDX:EAX
  91   andptr(tmp, count-1);
  92   jccb(Assembler::notZero, brLabel);
  93 }
  94 
  95 // Perform abort ratio calculation, set no_rtm bit if high ratio
  96 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  97 // tmpReg, rtm_counters_Reg and flags are killed
  98 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  99                                                     Register rtm_counters_Reg,
 100                                                     RTMLockingCounters* rtm_counters,
 101                                                     Metadata* method_data) {
 102   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 103 
 104   if (RTMLockingCalculationDelay > 0) {
 105     // Delay calculation
 106     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 107     testptr(tmpReg, tmpReg);
 108     jccb(Assembler::equal, L_done);
 109   }
 110   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 111   //   Aborted transactions = abort_count * 100
 112   //   All transactions = total_count *  RTMTotalCountIncrRate
 113   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 114 
 115   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 116   cmpptr(tmpReg, RTMAbortThreshold);
 117   jccb(Assembler::below, L_check_always_rtm2);
 118   imulptr(tmpReg, tmpReg, 100);
 119 
 120   Register scrReg = rtm_counters_Reg;
 121   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 122   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 123   imulptr(scrReg, scrReg, RTMAbortRatio);
 124   cmpptr(tmpReg, scrReg);
 125   jccb(Assembler::below, L_check_always_rtm1);
 126   if (method_data != NULL) {
 127     // set rtm_state to "no rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 131   }
 132   jmpb(L_done);
 133   bind(L_check_always_rtm1);
 134   // Reload RTMLockingCounters* address
 135   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 136   bind(L_check_always_rtm2);
 137   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 138   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 139   jccb(Assembler::below, L_done);
 140   if (method_data != NULL) {
 141     // set rtm_state to "always rtm" in MDO
 142     mov_metadata(tmpReg, method_data);
 143     lock();
 144     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 145   }
 146   bind(L_done);
 147 }
 148 
 149 // Update counters and perform abort ratio calculation
 150 // input:  abort_status_Reg
 151 // rtm_counters_Reg, flags are killed
 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 153                                       Register rtm_counters_Reg,
 154                                       RTMLockingCounters* rtm_counters,
 155                                       Metadata* method_data,
 156                                       bool profile_rtm) {
 157 
 158   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 159   // update rtm counters based on rax value at abort
 160   // reads abort_status_Reg, updates flags
 161   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 162   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 163   if (profile_rtm) {
 164     // Save abort status because abort_status_Reg is used by following code.
 165     if (RTMRetryCount > 0) {
 166       push(abort_status_Reg);
 167     }
 168     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 169     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 170     // restore abort status
 171     if (RTMRetryCount > 0) {
 172       pop(abort_status_Reg);
 173     }
 174   }
 175 }
 176 
 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 178 // inputs: retry_count_Reg
 179 //       : abort_status_Reg
 180 // output: retry_count_Reg decremented by 1
 181 // flags are killed
 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 183   Label doneRetry;
 184   assert(abort_status_Reg == rax, "");
 185   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 186   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 187   // if reason is in 0x6 and retry count != 0 then retry
 188   andptr(abort_status_Reg, 0x6);
 189   jccb(Assembler::zero, doneRetry);
 190   testl(retry_count_Reg, retry_count_Reg);
 191   jccb(Assembler::zero, doneRetry);
 192   pause();
 193   decrementl(retry_count_Reg);
 194   jmp(retryLabel);
 195   bind(doneRetry);
 196 }
 197 
 198 // Spin and retry if lock is busy,
 199 // inputs: box_Reg (monitor address)
 200 //       : retry_count_Reg
 201 // output: retry_count_Reg decremented by 1
 202 //       : clear z flag if retry count exceeded
 203 // tmp_Reg, scr_Reg, flags are killed
 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 205                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 206   Label SpinLoop, SpinExit, doneRetry;
 207   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 208 
 209   testl(retry_count_Reg, retry_count_Reg);
 210   jccb(Assembler::zero, doneRetry);
 211   decrementl(retry_count_Reg);
 212   movptr(scr_Reg, RTMSpinLoopCount);
 213 
 214   bind(SpinLoop);
 215   pause();
 216   decrementl(scr_Reg);
 217   jccb(Assembler::lessEqual, SpinExit);
 218   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 219   testptr(tmp_Reg, tmp_Reg);
 220   jccb(Assembler::notZero, SpinLoop);
 221 
 222   bind(SpinExit);
 223   jmp(retryLabel);
 224   bind(doneRetry);
 225   incrementl(retry_count_Reg); // clear z flag
 226 }
 227 
 228 // Use RTM for normal stack locks
 229 // Input: objReg (object to lock)
 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 231                                          Register retry_on_abort_count_Reg,
 232                                          RTMLockingCounters* stack_rtm_counters,
 233                                          Metadata* method_data, bool profile_rtm,
 234                                          Label& DONE_LABEL, Label& IsInflated) {
 235   assert(UseRTMForStackLocks, "why call this otherwise?");
 236   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 237   assert(tmpReg == rax, "");
 238   assert(scrReg == rdx, "");
 239   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 240 
 241   if (RTMRetryCount > 0) {
 242     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 243     bind(L_rtm_retry);
 244   }
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 246   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 247   jcc(Assembler::notZero, IsInflated);
 248 
 249   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 250     Label L_noincrement;
 251     if (RTMTotalCountIncrRate > 1) {
 252       // tmpReg, scrReg and flags are killed
 253       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 254     }
 255     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 256     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 257     bind(L_noincrement);
 258   }
 259   xbegin(L_on_abort);
 260   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 261   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 262   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 263   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 264 
 265   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 266   if (UseRTMXendForLockBusy) {
 267     xend();
 268     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 269     jmp(L_decrement_retry);
 270   }
 271   else {
 272     xabort(0);
 273   }
 274   bind(L_on_abort);
 275   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 276     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 277   }
 278   bind(L_decrement_retry);
 279   if (RTMRetryCount > 0) {
 280     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 281     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 282   }
 283 }
 284 
 285 // Use RTM for inflating locks
 286 // inputs: objReg (object to lock)
 287 //         boxReg (on-stack box address (displaced header location) - KILLED)
 288 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 289 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 290                                             Register scrReg, Register retry_on_busy_count_Reg,
 291                                             Register retry_on_abort_count_Reg,
 292                                             RTMLockingCounters* rtm_counters,
 293                                             Metadata* method_data, bool profile_rtm,
 294                                             Label& DONE_LABEL) {
 295   assert(UseRTMLocking, "why call this otherwise?");
 296   assert(tmpReg == rax, "");
 297   assert(scrReg == rdx, "");
 298   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 299   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 300 
 301   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 302   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 303   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 304 
 305   if (RTMRetryCount > 0) {
 306     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 307     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 308     bind(L_rtm_retry);
 309   }
 310   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 311     Label L_noincrement;
 312     if (RTMTotalCountIncrRate > 1) {
 313       // tmpReg, scrReg and flags are killed
 314       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 315     }
 316     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 317     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 318     bind(L_noincrement);
 319   }
 320   xbegin(L_on_abort);
 321   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 322   movptr(tmpReg, Address(tmpReg, owner_offset));
 323   testptr(tmpReg, tmpReg);
 324   jcc(Assembler::zero, DONE_LABEL);
 325   if (UseRTMXendForLockBusy) {
 326     xend();
 327     jmp(L_decrement_retry);
 328   }
 329   else {
 330     xabort(0);
 331   }
 332   bind(L_on_abort);
 333   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 334   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 335     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 336   }
 337   if (RTMRetryCount > 0) {
 338     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 339     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 340   }
 341 
 342   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 343   testptr(tmpReg, tmpReg) ;
 344   jccb(Assembler::notZero, L_decrement_retry) ;
 345 
 346   // Appears unlocked - try to swing _owner from null to non-null.
 347   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 348 #ifdef _LP64
 349   Register threadReg = r15_thread;
 350 #else
 351   get_thread(scrReg);
 352   Register threadReg = scrReg;
 353 #endif
 354   lock();
 355   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 356 
 357   if (RTMRetryCount > 0) {
 358     // success done else retry
 359     jccb(Assembler::equal, DONE_LABEL) ;
 360     bind(L_decrement_retry);
 361     // Spin and retry if lock is busy.
 362     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 363   }
 364   else {
 365     bind(L_decrement_retry);
 366   }
 367 }
 368 
 369 #endif //  INCLUDE_RTM_OPT
 370 
 371 // fast_lock and fast_unlock used by C2
 372 
 373 // Because the transitions from emitted code to the runtime
 374 // monitorenter/exit helper stubs are so slow it's critical that
 375 // we inline both the stack-locking fast path and the inflated fast path.
 376 //
 377 // See also: cmpFastLock and cmpFastUnlock.
 378 //
 379 // What follows is a specialized inline transliteration of the code
 380 // in enter() and exit(). If we're concerned about I$ bloat another
 381 // option would be to emit TrySlowEnter and TrySlowExit methods
 382 // at startup-time.  These methods would accept arguments as
 383 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 384 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 385 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 386 // In practice, however, the # of lock sites is bounded and is usually small.
 387 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 388 // if the processor uses simple bimodal branch predictors keyed by EIP
 389 // Since the helper routines would be called from multiple synchronization
 390 // sites.
 391 //
 392 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 393 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 394 // to those specialized methods.  That'd give us a mostly platform-independent
 395 // implementation that the JITs could optimize and inline at their pleasure.
 396 // Done correctly, the only time we'd need to cross to native could would be
 397 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 398 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 399 // (b) explicit barriers or fence operations.
 400 //
 401 // TODO:
 402 //
 403 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 404 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 405 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 406 //    the lock operators would typically be faster than reifying Self.
 407 //
 408 // *  Ideally I'd define the primitives as:
 409 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 410 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 411 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 412 //    Instead, we're stuck with a rather awkward and brittle register assignments below.
 413 //    Furthermore the register assignments are overconstrained, possibly resulting in
 414 //    sub-optimal code near the synchronization site.
 415 //
 416 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 417 //    Alternately, use a better sp-proximity test.
 418 //
 419 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 420 //    Either one is sufficient to uniquely identify a thread.
 421 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 422 //
 423 // *  Intrinsify notify() and notifyAll() for the common cases where the
 424 //    object is locked by the calling thread but the waitlist is empty.
 425 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 426 //
 427 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 428 //    But beware of excessive branch density on AMD Opterons.
 429 //
 430 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 431 //    or failure of the fast path.  If the fast path fails then we pass
 432 //    control to the slow path, typically in C.  In fast_lock and
 433 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 434 //    will emit a conditional branch immediately after the node.
 435 //    So we have branches to branches and lots of ICC.ZF games.
 436 //    Instead, it might be better to have C2 pass a "FailureLabel"
 437 //    into fast_lock and fast_unlock.  In the case of success, control
 438 //    will drop through the node.  ICC.ZF is undefined at exit.
 439 //    In the case of failure, the node will branch directly to the
 440 //    FailureLabel
 441 
 442 
 443 // obj: object to lock
 444 // box: on-stack box address (displaced header location) - KILLED
 445 // rax,: tmp -- KILLED
 446 // scr: tmp -- KILLED
 447 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 448                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 449                                  BiasedLockingCounters* counters,
 450                                  RTMLockingCounters* rtm_counters,
 451                                  RTMLockingCounters* stack_rtm_counters,
 452                                  Metadata* method_data,
 453                                  bool use_rtm, bool profile_rtm) {
 454   // Ensure the register assignments are disjoint
 455   assert(tmpReg == rax, "");
 456 
 457   if (use_rtm) {
 458     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 459   } else {
 460     assert(cx1Reg == noreg, "");
 461     assert(cx2Reg == noreg, "");
 462     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 463   }
 464 
 465   if (counters != NULL) {
 466     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 467   }
 468 
 469   // Possible cases that we'll encounter in fast_lock
 470   // ------------------------------------------------
 471   // * Inflated
 472   //    -- unlocked
 473   //    -- Locked
 474   //       = by self
 475   //       = by other
 476   // * biased
 477   //    -- by Self
 478   //    -- by other
 479   // * neutral
 480   // * stack-locked
 481   //    -- by self
 482   //       = sp-proximity test hits
 483   //       = sp-proximity test generates false-negative
 484   //    -- by other
 485   //
 486 
 487   Label IsInflated, DONE_LABEL;
 488 
 489   // it's stack-locked, biased or neutral
 490   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 491   // order to reduce the number of conditional branches in the most common cases.
 492   // Beware -- there's a subtle invariant that fetch of the markword
 493   // at [FETCH], below, will never observe a biased encoding (*101b).
 494   // If this invariant is not held we risk exclusion (safety) failure.
 495   if (UseBiasedLocking && !UseOptoBiasInlining) {
 496     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
 497   }
 498 
 499 #if INCLUDE_RTM_OPT
 500   if (UseRTMForStackLocks && use_rtm) {
 501     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 502                       stack_rtm_counters, method_data, profile_rtm,
 503                       DONE_LABEL, IsInflated);
 504   }
 505 #endif // INCLUDE_RTM_OPT
 506 
 507   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 508   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 509   jccb(Assembler::notZero, IsInflated);
 510 
 511   // Attempt stack-locking ...
 512   orptr (tmpReg, markWord::unlocked_value);
 513   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 514   lock();
 515   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 516   if (counters != NULL) {
 517     cond_inc32(Assembler::equal,
 518                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 519   }
 520   jcc(Assembler::equal, DONE_LABEL);           // Success
 521 
 522   // Recursive locking.
 523   // The object is stack-locked: markword contains stack pointer to BasicLock.
 524   // Locked by current thread if difference with current SP is less than one page.
 525   subptr(tmpReg, rsp);
 526   // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
 527   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 528   movptr(Address(boxReg, 0), tmpReg);
 529   if (counters != NULL) {
 530     cond_inc32(Assembler::equal,
 531                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 532   }
 533   jmp(DONE_LABEL);
 534 
 535   bind(IsInflated);
 536   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 537 
 538 #if INCLUDE_RTM_OPT
 539   // Use the same RTM locking code in 32- and 64-bit VM.
 540   if (use_rtm) {
 541     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 542                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 543   } else {
 544 #endif // INCLUDE_RTM_OPT
 545 
 546 #ifndef _LP64
 547   // The object is inflated.
 548 
 549   // boxReg refers to the on-stack BasicLock in the current frame.
 550   // We'd like to write:
 551   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 552   // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
 553   // additional latency as we have another ST in the store buffer that must drain.
 554 
 555   // avoid ST-before-CAS
 556   // register juggle because we need tmpReg for cmpxchgptr below
 557   movptr(scrReg, boxReg);
 558   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 559 
 560   // Optimistic form: consider XORL tmpReg,tmpReg
 561   movptr(tmpReg, NULL_WORD);
 562 
 563   // Appears unlocked - try to swing _owner from null to non-null.
 564   // Ideally, I'd manifest "Self" with get_thread and then attempt
 565   // to CAS the register containing Self into m->Owner.
 566   // But we don't have enough registers, so instead we can either try to CAS
 567   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 568   // we later store "Self" into m->Owner.  Transiently storing a stack address
 569   // (rsp or the address of the box) into  m->owner is harmless.
 570   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 571   lock();
 572   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 573   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 574   // If we weren't able to swing _owner from NULL to the BasicLock
 575   // then take the slow path.
 576   jccb  (Assembler::notZero, DONE_LABEL);
 577   // update _owner from BasicLock to thread
 578   get_thread (scrReg);                    // beware: clobbers ICCs
 579   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 580   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 581 
 582   // If the CAS fails we can either retry or pass control to the slow path.
 583   // We use the latter tactic.
 584   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 585   // If the CAS was successful ...
 586   //   Self has acquired the lock
 587   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 588   // Intentional fall-through into DONE_LABEL ...
 589 #else // _LP64
 590   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 591   movq(scrReg, tmpReg);
 592   xorq(tmpReg, tmpReg);
 593   lock();
 594   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 595   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 596   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 597   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 598   // Intentional fall-through into DONE_LABEL ...
 599   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 600 #endif // _LP64
 601 #if INCLUDE_RTM_OPT
 602   } // use_rtm()
 603 #endif
 604   // DONE_LABEL is a hot target - we'd really like to place it at the
 605   // start of cache line by padding with NOPs.
 606   // See the AMD and Intel software optimization manuals for the
 607   // most efficient "long" NOP encodings.
 608   // Unfortunately none of our alignment mechanisms suffice.
 609   bind(DONE_LABEL);
 610 
 611   // At DONE_LABEL the icc ZFlag is set as follows ...
 612   // fast_unlock uses the same protocol.
 613   // ZFlag == 1 -> Success
 614   // ZFlag == 0 -> Failure - force control through the slow path
 615 }
 616 
 617 // obj: object to unlock
 618 // box: box address (displaced header location), killed.  Must be EAX.
 619 // tmp: killed, cannot be obj nor box.
 620 //
 621 // Some commentary on balanced locking:
 622 //
 623 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 624 // Methods that don't have provably balanced locking are forced to run in the
 625 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 626 // The interpreter provides two properties:
 627 // I1:  At return-time the interpreter automatically and quietly unlocks any
 628 //      objects acquired the current activation (frame).  Recall that the
 629 //      interpreter maintains an on-stack list of locks currently held by
 630 //      a frame.
 631 // I2:  If a method attempts to unlock an object that is not held by the
 632 //      the frame the interpreter throws IMSX.
 633 //
 634 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 635 // B() doesn't have provably balanced locking so it runs in the interpreter.
 636 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 637 // is still locked by A().
 638 //
 639 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 640 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 641 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 642 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 643 // Arguably given that the spec legislates the JNI case as undefined our implementation
 644 // could reasonably *avoid* checking owner in fast_unlock().
 645 // In the interest of performance we elide m->Owner==Self check in unlock.
 646 // A perfectly viable alternative is to elide the owner check except when
 647 // Xcheck:jni is enabled.
 648 
 649 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 650   assert(boxReg == rax, "");
 651   assert_different_registers(objReg, boxReg, tmpReg);
 652 
 653   Label DONE_LABEL, Stacked, CheckSucc;
 654 
 655   // Critically, the biased locking test must have precedence over
 656   // and appear before the (box->dhw == 0) recursive stack-lock test.
 657   if (UseBiasedLocking && !UseOptoBiasInlining) {
 658     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 659   }
 660 
 661 #if INCLUDE_RTM_OPT
 662   if (UseRTMForStackLocks && use_rtm) {
 663     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 664     Label L_regular_unlock;
 665     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 666     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 667     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 668     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 669     xend();                                                           // otherwise end...
 670     jmp(DONE_LABEL);                                                  // ... and we're done
 671     bind(L_regular_unlock);
 672   }
 673 #endif
 674 
 675   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 676   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 677   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 678   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 679   jccb  (Assembler::zero, Stacked);
 680 
 681   // It's inflated.
 682 #if INCLUDE_RTM_OPT
 683   if (use_rtm) {
 684     Label L_regular_inflated_unlock;
 685     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 686     movptr(boxReg, Address(tmpReg, owner_offset));
 687     testptr(boxReg, boxReg);
 688     jccb(Assembler::notZero, L_regular_inflated_unlock);
 689     xend();
 690     jmpb(DONE_LABEL);
 691     bind(L_regular_inflated_unlock);
 692   }
 693 #endif
 694 
 695   // Despite our balanced locking property we still check that m->_owner == Self
 696   // as java routines or native JNI code called by this thread might
 697   // have released the lock.
 698   // Refer to the comments in synchronizer.cpp for how we might encode extra
 699   // state in _succ so we can avoid fetching EntryList|cxq.
 700   //
 701   // I'd like to add more cases in fast_lock() and fast_unlock() --
 702   // such as recursive enter and exit -- but we have to be wary of
 703   // I$ bloat, T$ effects and BP$ effects.
 704   //
 705   // If there's no contention try a 1-0 exit.  That is, exit without
 706   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 707   // we detect and recover from the race that the 1-0 exit admits.
 708   //
 709   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 710   // before it STs null into _owner, releasing the lock.  Updates
 711   // to data protected by the critical section must be visible before
 712   // we drop the lock (and thus before any other thread could acquire
 713   // the lock and observe the fields protected by the lock).
 714   // IA32's memory-model is SPO, so STs are ordered with respect to
 715   // each other and there's no need for an explicit barrier (fence).
 716   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 717 #ifndef _LP64
 718   get_thread (boxReg);
 719 
 720   // Note that we could employ various encoding schemes to reduce
 721   // the number of loads below (currently 4) to just 2 or 3.
 722   // Refer to the comments in synchronizer.cpp.
 723   // In practice the chain of fetches doesn't seem to impact performance, however.
 724   xorptr(boxReg, boxReg);
 725   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 726   jccb  (Assembler::notZero, DONE_LABEL);
 727   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 728   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 729   jccb  (Assembler::notZero, CheckSucc);
 730   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 731   jmpb  (DONE_LABEL);
 732 
 733   bind (Stacked);
 734   // It's not inflated and it's not recursively stack-locked and it's not biased.
 735   // It must be stack-locked.
 736   // Try to reset the header to displaced header.
 737   // The "box" value on the stack is stable, so we can reload
 738   // and be assured we observe the same value as above.
 739   movptr(tmpReg, Address(boxReg, 0));
 740   lock();
 741   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 742   // Intention fall-thru into DONE_LABEL
 743 
 744   // DONE_LABEL is a hot target - we'd really like to place it at the
 745   // start of cache line by padding with NOPs.
 746   // See the AMD and Intel software optimization manuals for the
 747   // most efficient "long" NOP encodings.
 748   // Unfortunately none of our alignment mechanisms suffice.
 749   bind (CheckSucc);
 750 #else // _LP64
 751   // It's inflated
 752   xorptr(boxReg, boxReg);
 753   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 754   jccb  (Assembler::notZero, DONE_LABEL);
 755   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 756   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 757   jccb  (Assembler::notZero, CheckSucc);
 758   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 759   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 760   jmpb  (DONE_LABEL);
 761 
 762   // Try to avoid passing control into the slow_path ...
 763   Label LSuccess, LGoSlowPath ;
 764   bind  (CheckSucc);
 765 
 766   // The following optional optimization can be elided if necessary
 767   // Effectively: if (succ == null) goto slow path
 768   // The code reduces the window for a race, however,
 769   // and thus benefits performance.
 770   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 771   jccb  (Assembler::zero, LGoSlowPath);
 772 
 773   xorptr(boxReg, boxReg);
 774   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 775   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 776 
 777   // Memory barrier/fence
 778   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 779   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 780   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 781   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 782   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 783   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 784   lock(); addl(Address(rsp, 0), 0);
 785 
 786   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 787   jccb  (Assembler::notZero, LSuccess);
 788 
 789   // Rare inopportune interleaving - race.
 790   // The successor vanished in the small window above.
 791   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 792   // We need to ensure progress and succession.
 793   // Try to reacquire the lock.
 794   // If that fails then the new owner is responsible for succession and this
 795   // thread needs to take no further action and can exit via the fast path (success).
 796   // If the re-acquire succeeds then pass control into the slow path.
 797   // As implemented, this latter mode is horrible because we generated more
 798   // coherence traffic on the lock *and* artifically extended the critical section
 799   // length while by virtue of passing control into the slow path.
 800 
 801   // box is really RAX -- the following CMPXCHG depends on that binding
 802   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 803   lock();
 804   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 805   // There's no successor so we tried to regrab the lock.
 806   // If that didn't work, then another thread grabbed the
 807   // lock so we're done (and exit was a success).
 808   jccb  (Assembler::notEqual, LSuccess);
 809   // Intentional fall-through into slow path
 810 
 811   bind  (LGoSlowPath);
 812   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 813   jmpb  (DONE_LABEL);
 814 
 815   bind  (LSuccess);
 816   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 817   jmpb  (DONE_LABEL);
 818 
 819   bind  (Stacked);
 820   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 821   lock();
 822   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 823 
 824 #endif
 825   bind(DONE_LABEL);
 826 }
 827 
 828 //-------------------------------------------------------------------------------------------
 829 // Generic instructions support for use in .ad files C2 code generation
 830 
 831 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 832   if (dst != src) {
 833     movdqu(dst, src);
 834   }
 835   if (opcode == Op_AbsVD) {
 836     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 837   } else {
 838     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 839     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 840   }
 841 }
 842 
 843 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 844   if (opcode == Op_AbsVD) {
 845     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 846   } else {
 847     assert((opcode == Op_NegVD),"opcode should be Op_NegD");
 848     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 849   }
 850 }
 851 
 852 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 853   if (dst != src) {
 854     movdqu(dst, src);
 855   }
 856   if (opcode == Op_AbsVF) {
 857     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 858   } else {
 859     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 860     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 861   }
 862 }
 863 
 864 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 865   if (opcode == Op_AbsVF) {
 866     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 867   } else {
 868     assert((opcode == Op_NegVF),"opcode should be Op_NegF");
 869     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 870   }
 871 }
 872 
 873 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 874   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 875 
 876   if (opcode == Op_MinV) {
 877     if (elem_bt == T_BYTE) {
 878       pminsb(dst, src);
 879     } else if (elem_bt == T_SHORT) {
 880       pminsw(dst, src);
 881     } else if (elem_bt == T_INT) {
 882       pminsd(dst, src);
 883     } else {
 884       assert(elem_bt == T_LONG, "required");
 885       assert(tmp == xmm0, "required");
 886       movdqu(xmm0, dst);
 887       pcmpgtq(xmm0, src);
 888       blendvpd(dst, src);  // xmm0 as mask
 889     }
 890   } else { // opcode == Op_MaxV
 891     if (elem_bt == T_BYTE) {
 892       pmaxsb(dst, src);
 893     } else if (elem_bt == T_SHORT) {
 894       pmaxsw(dst, src);
 895     } else if (elem_bt == T_INT) {
 896       pmaxsd(dst, src);
 897     } else {
 898       assert(elem_bt == T_LONG, "required");
 899       assert(tmp == xmm0, "required");
 900       movdqu(xmm0, src);
 901       pcmpgtq(xmm0, dst);
 902       blendvpd(dst, src);  // xmm0 as mask
 903     }
 904   }
 905 }
 906 
 907 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 908                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 909                                  int vlen_enc) {
 910   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 911 
 912   if (opcode == Op_MinV) {
 913     if (elem_bt == T_BYTE) {
 914       vpminsb(dst, src1, src2, vlen_enc);
 915     } else if (elem_bt == T_SHORT) {
 916       vpminsw(dst, src1, src2, vlen_enc);
 917     } else if (elem_bt == T_INT) {
 918       vpminsd(dst, src1, src2, vlen_enc);
 919     } else {
 920       assert(elem_bt == T_LONG, "required");
 921       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 922         vpminsq(dst, src1, src2, vlen_enc);
 923       } else {
 924         vpcmpgtq(dst, src1, src2, vlen_enc);
 925         vblendvpd(dst, src1, src2, dst, vlen_enc);
 926       }
 927     }
 928   } else { // opcode == Op_MaxV
 929     if (elem_bt == T_BYTE) {
 930       vpmaxsb(dst, src1, src2, vlen_enc);
 931     } else if (elem_bt == T_SHORT) {
 932       vpmaxsw(dst, src1, src2, vlen_enc);
 933     } else if (elem_bt == T_INT) {
 934       vpmaxsd(dst, src1, src2, vlen_enc);
 935     } else {
 936       assert(elem_bt == T_LONG, "required");
 937       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 938         vpmaxsq(dst, src1, src2, vlen_enc);
 939       } else {
 940         vpcmpgtq(dst, src1, src2, vlen_enc);
 941         vblendvpd(dst, src2, src1, dst, vlen_enc);
 942       }
 943     }
 944   }
 945 }
 946 
 947 // Float/Double min max
 948 
 949 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 950                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 951                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 952                                    int vlen_enc) {
 953   assert(UseAVX > 0, "required");
 954   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 955          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 956   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 957 
 958   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 959   bool is_double_word = is_double_word_type(elem_bt);
 960 
 961   if (!is_double_word && is_min) {
 962     vblendvps(atmp, a, b, a, vlen_enc);
 963     vblendvps(btmp, b, a, a, vlen_enc);
 964     vminps(tmp, atmp, btmp, vlen_enc);
 965     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 966     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 967   } else if (!is_double_word && !is_min) {
 968     vblendvps(btmp, b, a, b, vlen_enc);
 969     vblendvps(atmp, a, b, b, vlen_enc);
 970     vmaxps(tmp, atmp, btmp, vlen_enc);
 971     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 972     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 973   } else if (is_double_word && is_min) {
 974     vblendvpd(atmp, a, b, a, vlen_enc);
 975     vblendvpd(btmp, b, a, a, vlen_enc);
 976     vminpd(tmp, atmp, btmp, vlen_enc);
 977     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 978     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 979   } else {
 980     assert(is_double_word && !is_min, "sanity");
 981     vblendvpd(btmp, b, a, b, vlen_enc);
 982     vblendvpd(atmp, a, b, b, vlen_enc);
 983     vmaxpd(tmp, atmp, btmp, vlen_enc);
 984     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 985     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 986   }
 987 }
 988 
 989 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 990                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 991                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 992                                     int vlen_enc) {
 993   assert(UseAVX > 2, "required");
 994   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 995          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 996   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 997 
 998   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 999   bool is_double_word = is_double_word_type(elem_bt);
1000   bool merge = true;
1001 
1002   if (!is_double_word && is_min) {
1003     evpmovd2m(ktmp, a, vlen_enc);
1004     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1005     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1006     vminps(dst, atmp, btmp, vlen_enc);
1007     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1008     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1009   } else if (!is_double_word && !is_min) {
1010     evpmovd2m(ktmp, b, vlen_enc);
1011     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1012     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1013     vmaxps(dst, atmp, btmp, vlen_enc);
1014     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1015     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1016   } else if (is_double_word && is_min) {
1017     evpmovq2m(ktmp, a, vlen_enc);
1018     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1019     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1020     vminpd(dst, atmp, btmp, vlen_enc);
1021     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1022     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1023   } else {
1024     assert(is_double_word && !is_min, "sanity");
1025     evpmovq2m(ktmp, b, vlen_enc);
1026     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1027     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1028     vmaxpd(dst, atmp, btmp, vlen_enc);
1029     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1030     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1031   }
1032 }
1033 
1034 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1035   if (sign) {
1036     pmovsxbw(dst, src);
1037   } else {
1038     pmovzxbw(dst, src);
1039   }
1040 }
1041 
1042 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1043   if (sign) {
1044     vpmovsxbw(dst, src, vector_len);
1045   } else {
1046     vpmovzxbw(dst, src, vector_len);
1047   }
1048 }
1049 
1050 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1051   if (sign) {
1052     vpmovsxbd(dst, src, vector_len);
1053   } else {
1054     vpmovzxbd(dst, src, vector_len);
1055   }
1056 }
1057 
1058 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1059   if (sign) {
1060     vpmovsxwd(dst, src, vector_len);
1061   } else {
1062     vpmovzxwd(dst, src, vector_len);
1063   }
1064 }
1065 
1066 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1067   switch (opcode) {
1068     case Op_RShiftVI:  psrad(dst, shift); break;
1069     case Op_LShiftVI:  pslld(dst, shift); break;
1070     case Op_URShiftVI: psrld(dst, shift); break;
1071 
1072     default: assert(false, "%s", NodeClassNames[opcode]);
1073   }
1074 }
1075 
1076 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1077   switch (opcode) {
1078     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1079     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1080     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1081 
1082     default: assert(false, "%s", NodeClassNames[opcode]);
1083   }
1084 }
1085 
1086 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1087   switch (opcode) {
1088     case Op_RShiftVB:  // fall-through
1089     case Op_RShiftVS:  psraw(dst, shift); break;
1090 
1091     case Op_LShiftVB:  // fall-through
1092     case Op_LShiftVS:  psllw(dst, shift);   break;
1093 
1094     case Op_URShiftVS: // fall-through
1095     case Op_URShiftVB: psrlw(dst, shift);  break;
1096 
1097     default: assert(false, "%s", NodeClassNames[opcode]);
1098   }
1099 }
1100 
1101 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1102   switch (opcode) {
1103     case Op_RShiftVB:  // fall-through
1104     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1105 
1106     case Op_LShiftVB:  // fall-through
1107     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1108 
1109     case Op_URShiftVS: // fall-through
1110     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1111 
1112     default: assert(false, "%s", NodeClassNames[opcode]);
1113   }
1114 }
1115 
1116 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1117   switch (opcode) {
1118     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1119     case Op_LShiftVL:  psllq(dst, shift); break;
1120     case Op_URShiftVL: psrlq(dst, shift); break;
1121 
1122     default: assert(false, "%s", NodeClassNames[opcode]);
1123   }
1124 }
1125 
1126 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1127   switch (opcode) {
1128     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1129     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1130     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1131 
1132     default: assert(false, "%s", NodeClassNames[opcode]);
1133   }
1134 }
1135 
1136 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1137   switch (opcode) {
1138     case Op_VRShiftV:  vpsravd(dst, src, shift, vlen_enc); break;
1139     case Op_VLShiftV:  vpsllvd(dst, src, shift, vlen_enc); break;
1140     case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break;
1141 
1142     default: assert(false, "%s", NodeClassNames[opcode]);
1143   }
1144 }
1145 
1146 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1147   switch (opcode) {
1148     case Op_VRShiftV:  evpsravw(dst, src, shift, vlen_enc); break;
1149     case Op_VLShiftV:  evpsllvw(dst, src, shift, vlen_enc); break;
1150     case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break;
1151 
1152     default: assert(false, "%s", NodeClassNames[opcode]);
1153   }
1154 }
1155 
1156 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1157   assert(UseAVX >= 2, "required");
1158   switch (opcode) {
1159     case Op_VRShiftV: {
1160       if (UseAVX > 2) {
1161         assert(tmp == xnoreg, "not used");
1162         if (!VM_Version::supports_avx512vl()) {
1163           vlen_enc = Assembler::AVX_512bit;
1164         }
1165         evpsravq(dst, src, shift, vlen_enc);
1166       } else {
1167         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1168         vpsrlvq(dst, src, shift, vlen_enc);
1169         vpsrlvq(tmp, tmp, shift, vlen_enc);
1170         vpxor(dst, dst, tmp, vlen_enc);
1171         vpsubq(dst, dst, tmp, vlen_enc);
1172       }
1173       break;
1174     }
1175     case Op_VLShiftV: {
1176       assert(tmp == xnoreg, "not used");
1177       vpsllvq(dst, src, shift, vlen_enc);
1178       break;
1179     }
1180     case Op_VURShiftV: {
1181       assert(tmp == xnoreg, "not used");
1182       vpsrlvq(dst, src, shift, vlen_enc);
1183       break;
1184     }
1185     default: assert(false, "%s", NodeClassNames[opcode]);
1186   }
1187 }
1188 
1189 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1190 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1191   bool sign = (opcode == Op_VURShiftV) ? false : true;
1192   assert(vector_len == 0, "required");
1193   vextendbd(sign, dst, src, 1);
1194   vpmovzxbd(vtmp, shift, 1);
1195   varshiftd(opcode, dst, dst, vtmp, 1);
1196   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1197   vextracti128_high(vtmp, dst);
1198   vpackusdw(dst, dst, vtmp, 0);
1199 }
1200 
1201 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1202 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1203   bool sign = (opcode == Op_VURShiftV) ? false : true;
1204   int ext_vector_len = vector_len + 1;
1205   vextendbw(sign, dst, src, ext_vector_len);
1206   vpmovzxbw(vtmp, shift, ext_vector_len);
1207   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1208   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1209   if (vector_len == 0) {
1210     vextracti128_high(vtmp, dst);
1211     vpackuswb(dst, dst, vtmp, vector_len);
1212   } else {
1213     vextracti64x4_high(vtmp, dst);
1214     vpackuswb(dst, dst, vtmp, vector_len);
1215     vpermq(dst, dst, 0xD8, vector_len);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1220   switch(typ) {
1221     case T_BYTE:
1222       pinsrb(dst, val, idx);
1223       break;
1224     case T_SHORT:
1225       pinsrw(dst, val, idx);
1226       break;
1227     case T_INT:
1228       pinsrd(dst, val, idx);
1229       break;
1230     case T_LONG:
1231       pinsrq(dst, val, idx);
1232       break;
1233     default:
1234       assert(false,"Should not reach here.");
1235       break;
1236   }
1237 }
1238 
1239 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1240   switch(typ) {
1241     case T_BYTE:
1242       vpinsrb(dst, src, val, idx);
1243       break;
1244     case T_SHORT:
1245       vpinsrw(dst, src, val, idx);
1246       break;
1247     case T_INT:
1248       vpinsrd(dst, src, val, idx);
1249       break;
1250     case T_LONG:
1251       vpinsrq(dst, src, val, idx);
1252       break;
1253     default:
1254       assert(false,"Should not reach here.");
1255       break;
1256   }
1257 }
1258 
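     // Gather elements from (base + idx[i] * element size) using 32-bit indices, with
     // per-element predication supplied in an XMM mask register (AVX2 form).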
1259 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1260   switch(typ) {
1261     case T_INT:
1262       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1263       break;
1264     case T_FLOAT:
1265       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1266       break;
1267     case T_LONG:
1268       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1269       break;
1270     case T_DOUBLE:
1271       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1272       break;
1273     default:
1274       assert(false,"Should not reach here.");
1275       break;
1276   }
1277 }
1278 
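     // AVX-512 gather: same addressing as above, but the predicate is an opmask (k) register.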
1279 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1280   switch(typ) {
1281     case T_INT:
1282       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1283       break;
1284     case T_FLOAT:
1285       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1286       break;
1287     case T_LONG:
1288       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1289       break;
1290     case T_DOUBLE:
1291       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1292       break;
1293     default:
1294       assert(false,"Should not reach here.");
1295       break;
1296   }
1297 }
1298 
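     // AVX-512 scatter: store the vector elements to (base + idx[i] * element size) under an opmask.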
1299 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1300   switch(typ) {
1301     case T_INT:
1302       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1303       break;
1304     case T_FLOAT:
1305       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1306       break;
1307     case T_LONG:
1308       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1309       break;
1310     case T_DOUBLE:
1311       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1312       break;
1313     default:
1314       assert(false,"Should not reach here.");
1315       break;
1316   }
1317 }
1318 
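     // Turn a vector of byte-sized booleans (0 or 1) into element-wide masks:
     // negate to get 0/-1 per byte, then sign-extend to the requested element size.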
1319 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
1320   if (vlen_in_bytes <= 16) {
1321     pxor (dst, dst);
1322     psubb(dst, src);
1323     switch (elem_bt) {
1324       case T_BYTE:   /* nothing to do */ break;
1325       case T_SHORT:  pmovsxbw(dst, dst); break;
1326       case T_INT:    pmovsxbd(dst, dst); break;
1327       case T_FLOAT:  pmovsxbd(dst, dst); break;
1328       case T_LONG:   pmovsxbq(dst, dst); break;
1329       case T_DOUBLE: pmovsxbq(dst, dst); break;
1330 
1331       default: assert(false, "%s", type2name(elem_bt));
1332     }
1333   } else {
1334     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1335 
1336     vpxor (dst, dst, dst, vlen_enc);
1337     vpsubb(dst, dst, src, vlen_enc);
1338     switch (elem_bt) {
1339       case T_BYTE:   /* nothing to do */            break;
1340       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1341       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1342       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1343       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1344       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1345 
1346       default: assert(false, "%s", type2name(elem_bt));
1347     }
1348   }
1349 }
1350 
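     // Load the iota index constant (0, 1, 2, ...) from the stub area, using the move
     // that matches the requested vector width.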
1351 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1352   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1353   if (vlen_in_bytes <= 16) {
1354     movdqu(dst, addr, scratch);
1355   } else if (vlen_in_bytes == 32) {
1356     vmovdqu(dst, addr, scratch);
1357   } else {
1358     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1359     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1360   }
1361 }
1362 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1363 
1364 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1365   int vector_len = Assembler::AVX_128bit;
1366 
1367   switch (opcode) {
1368     case Op_AndReductionV:  pand(dst, src); break;
1369     case Op_OrReductionV:   por (dst, src); break;
1370     case Op_XorReductionV:  pxor(dst, src); break;
1371     case Op_MinReductionV:
1372       switch (typ) {
1373         case T_BYTE:        pminsb(dst, src); break;
1374         case T_SHORT:       pminsw(dst, src); break;
1375         case T_INT:         pminsd(dst, src); break;
1376         case T_LONG:        assert(UseAVX > 2, "required");
1377                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1378         default:            assert(false, "wrong type");
1379       }
1380       break;
1381     case Op_MaxReductionV:
1382       switch (typ) {
1383         case T_BYTE:        pmaxsb(dst, src); break;
1384         case T_SHORT:       pmaxsw(dst, src); break;
1385         case T_INT:         pmaxsd(dst, src); break;
1386         case T_LONG:        assert(UseAVX > 2, "required");
1387                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1388         default:            assert(false, "wrong type");
1389       }
1390       break;
1391     case Op_AddReductionVF: addss(dst, src); break;
1392     case Op_AddReductionVD: addsd(dst, src); break;
1393     case Op_AddReductionVI:
1394       switch (typ) {
1395         case T_BYTE:        paddb(dst, src); break;
1396         case T_SHORT:       paddw(dst, src); break;
1397         case T_INT:         paddd(dst, src); break;
1398         default:            assert(false, "wrong type");
1399       }
1400       break;
1401     case Op_AddReductionVL: paddq(dst, src); break;
1402     case Op_MulReductionVF: mulss(dst, src); break;
1403     case Op_MulReductionVD: mulsd(dst, src); break;
1404     case Op_MulReductionVI:
1405       switch (typ) {
1406         case T_SHORT:       pmullw(dst, src); break;
1407         case T_INT:         pmulld(dst, src); break;
1408         default:            assert(false, "wrong type");
1409       }
1410       break;
1411     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1412                             vpmullq(dst, dst, src, vector_len); break;
1413     default:                assert(false, "wrong opcode");
1414   }
1415 }
1416 
1417 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1418   int vector_len = Assembler::AVX_256bit;
1419 
1420   switch (opcode) {
1421     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1422     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1423     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1424     case Op_MinReductionV:
1425       switch (typ) {
1426         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1427         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1428         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1429         case T_LONG:        assert(UseAVX > 2, "required");
1430                             vpminsq(dst, src1, src2, vector_len); break;
1431         default:            assert(false, "wrong type");
1432       }
1433       break;
1434     case Op_MaxReductionV:
1435       switch (typ) {
1436         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1437         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1438         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1439         case T_LONG:        assert(UseAVX > 2, "required");
1440                             vpmaxsq(dst, src1, src2, vector_len); break;
1441         default:            assert(false, "wrong type");
1442       }
1443       break;
1444     case Op_AddReductionVI:
1445       switch (typ) {
1446         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1447         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1448         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1449         default:            assert(false, "wrong type");
1450       }
1451       break;
1452     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1453     case Op_MulReductionVI:
1454       switch (typ) {
1455         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1456         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1457         default:            assert(false, "wrong type");
1458       }
1459       break;
1460     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1461     default:                assert(false, "wrong opcode");
1462   }
1463 }
1464 
1465 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1466                                   XMMRegister dst, XMMRegister src,
1467                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1468   switch (opcode) {
1469     case Op_AddReductionVF:
1470     case Op_MulReductionVF:
1471       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1472       break;
1473 
1474     case Op_AddReductionVD:
1475     case Op_MulReductionVD:
1476       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1477       break;
1478 
1479     default: assert(false, "wrong opcode");
1480   }
1481 }
1482 
1483 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1484                              Register dst, Register src1, XMMRegister src2,
1485                              XMMRegister vtmp1, XMMRegister vtmp2) {
1486   switch (vlen) {
1487     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1488     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1489     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1490     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1491 
1492     default: assert(false, "wrong vector length");
1493   }
1494 }
1495 
1496 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1497                              Register dst, Register src1, XMMRegister src2,
1498                              XMMRegister vtmp1, XMMRegister vtmp2) {
1499   switch (vlen) {
1500     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1501     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1502     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1503     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1504 
1505     default: assert(false, "wrong vector length");
1506   }
1507 }
1508 
1509 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1510                              Register dst, Register src1, XMMRegister src2,
1511                              XMMRegister vtmp1, XMMRegister vtmp2) {
1512   switch (vlen) {
1513     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1514     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1515     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1516     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1517 
1518     default: assert(false, "wrong vector length");
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1523                              Register dst, Register src1, XMMRegister src2,
1524                              XMMRegister vtmp1, XMMRegister vtmp2) {
1525   switch (vlen) {
1526     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1527     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1528     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1529     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1530 
1531     default: assert(false, "wrong vector length");
1532   }
1533 }
1534 
1535 #ifdef _LP64
1536 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1537                              Register dst, Register src1, XMMRegister src2,
1538                              XMMRegister vtmp1, XMMRegister vtmp2) {
1539   switch (vlen) {
1540     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1541     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1542     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1543 
1544     default: assert(false, "wrong vector length");
1545   }
1546 }
1547 #endif // _LP64
1548 
1549 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1550   switch (vlen) {
1551     case 2:
1552       assert(vtmp2 == xnoreg, "");
1553       reduce2F(opcode, dst, src, vtmp1);
1554       break;
1555     case 4:
1556       assert(vtmp2 == xnoreg, "");
1557       reduce4F(opcode, dst, src, vtmp1);
1558       break;
1559     case 8:
1560       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1561       break;
1562     case 16:
1563       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1564       break;
1565     default: assert(false, "wrong vector length");
1566   }
1567 }
1568 
1569 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1570   switch (vlen) {
1571     case 2:
1572       assert(vtmp2 == xnoreg, "");
1573       reduce2D(opcode, dst, src, vtmp1);
1574       break;
1575     case 4:
1576       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1577       break;
1578     case 8:
1579       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1580       break;
1581     default: assert(false, "wrong vector length");
1582   }
1583 }
1584 
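     // Reduce two ints: for addition a horizontal add (phaddd) folds the pair, otherwise the
     // upper element is shuffled down and combined; the scalar accumulator in src1 is then
     // folded into the result.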
1585 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1586   if (opcode == Op_AddReductionVI) {
1587     if (vtmp1 != src2) {
1588       movdqu(vtmp1, src2);
1589     }
1590     phaddd(vtmp1, vtmp1);
1591   } else {
1592     pshufd(vtmp1, src2, 0x1);
1593     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1594   }
1595   movdl(vtmp2, src1);
1596   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1597   movdl(dst, vtmp1);
1598 }
1599 
1600 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1601   if (opcode == Op_AddReductionVI) {
1602     if (vtmp1 != src2) {
1603       movdqu(vtmp1, src2);
1604     }
1605     phaddd(vtmp1, src2);
1606     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1607   } else {
1608     pshufd(vtmp2, src2, 0xE);
1609     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1610     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1611   }
1612 }
1613 
1614 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1615   if (opcode == Op_AddReductionVI) {
1616     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1617     vextracti128_high(vtmp2, vtmp1);
1618     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1619     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1620   } else {
1621     vextracti128_high(vtmp1, src2);
1622     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1623     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1624   }
1625 }
1626 
1627 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1628   vextracti64x4_high(vtmp2, src2);
1629   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1630   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1631 }
1632 
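     // Reduce eight bytes by repeatedly folding the upper half onto the lower half
     // (4, 2, then 1 byte apart), then widening to int to fold in src1 and
     // sign-extending the final byte into dst.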
1633 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1634   pshufd(vtmp2, src2, 0x1);
1635   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1636   movdqu(vtmp1, vtmp2);
1637   psrldq(vtmp1, 2);
1638   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1639   movdqu(vtmp2, vtmp1);
1640   psrldq(vtmp2, 1);
1641   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1642   movdl(vtmp2, src1);
1643   pmovsxbd(vtmp1, vtmp1);
1644   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1645   pextrb(dst, vtmp1, 0x0);
1646   movsbl(dst, dst);
1647 }
1648 
1649 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1650   pshufd(vtmp1, src2, 0xE);
1651   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1652   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1653 }
1654 
1655 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1656   vextracti128_high(vtmp2, src2);
1657   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1658   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1659 }
1660 
1661 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1662   vextracti64x4_high(vtmp1, src2);
1663   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1664   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1665 }
1666 
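     // There is no packed byte multiply, so byte multiply reductions first widen the
     // bytes to shorts and reuse the short reduction.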
1667 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1668   pmovsxbw(vtmp2, src2);
1669   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1670 }
1671 
1672 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1673   if (UseAVX > 1) {
1674     int vector_len = Assembler::AVX_256bit;
1675     vpmovsxbw(vtmp1, src2, vector_len);
1676     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1677   } else {
1678     pmovsxbw(vtmp2, src2);
1679     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1680     pshufd(vtmp2, src2, 0xE);
1681     pmovsxbw(vtmp2, vtmp2);
1682     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1683   }
1684 }
1685 
1686 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1687   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1688     int vector_len = Assembler::AVX_512bit;
1689     vpmovsxbw(vtmp1, src2, vector_len);
1690     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1691   } else {
1692     assert(UseAVX >= 2,"Should not reach here.");
1693     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1694     vextracti128_high(vtmp2, src2);
1695     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1696   }
1697 }
1698 
1699 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1700   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1701   vextracti64x4_high(vtmp2, src2);
1702   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1703 }
1704 
1705 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1706   if (opcode == Op_AddReductionVI) {
1707     if (vtmp1 != src2) {
1708       movdqu(vtmp1, src2);
1709     }
1710     phaddw(vtmp1, vtmp1);
1711     phaddw(vtmp1, vtmp1);
1712   } else {
1713     pshufd(vtmp2, src2, 0x1);
1714     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1715     movdqu(vtmp1, vtmp2);
1716     psrldq(vtmp1, 2);
1717     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1718   }
1719   movdl(vtmp2, src1);
1720   pmovsxwd(vtmp1, vtmp1);
1721   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1722   pextrw(dst, vtmp1, 0x0);
1723   movswl(dst, dst);
1724 }
1725 
1726 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1727   if (opcode == Op_AddReductionVI) {
1728     if (vtmp1 != src2) {
1729       movdqu(vtmp1, src2);
1730     }
1731     phaddw(vtmp1, src2);
1732   } else {
1733     pshufd(vtmp1, src2, 0xE);
1734     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1735   }
1736   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1737 }
1738 
1739 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1740   if (opcode == Op_AddReductionVI) {
1741     int vector_len = Assembler::AVX_256bit;
1742     vphaddw(vtmp2, src2, src2, vector_len);
1743     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1744   } else {
1745     vextracti128_high(vtmp2, src2);
1746     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1747   }
1748   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1749 }
1750 
1751 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1752   int vector_len = Assembler::AVX_256bit;
1753   vextracti64x4_high(vtmp1, src2);
1754   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1755   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1756 }
1757 
1758 #ifdef _LP64
1759 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1760   pshufd(vtmp2, src2, 0xE);
1761   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1762   movdq(vtmp1, src1);
1763   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1764   movdq(dst, vtmp1);
1765 }
1766 
1767 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1768   vextracti128_high(vtmp1, src2);
1769   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1770   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1771 }
1772 
1773 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1774   vextracti64x4_high(vtmp2, src2);
1775   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1776   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1777 }
1778 #endif // _LP64
1779 
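     // FP reductions: dst carries the scalar accumulator on entry. Combine element 0 first,
     // then shuffle each remaining element into the low lane and combine it as well.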
1780 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1781   reduce_operation_128(T_FLOAT, opcode, dst, src);
1782   pshufd(vtmp, src, 0x1);
1783   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1784 }
1785 
1786 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1787   reduce2F(opcode, dst, src, vtmp);
1788   pshufd(vtmp, src, 0x2);
1789   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1790   pshufd(vtmp, src, 0x3);
1791   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1792 }
1793 
1794 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1795   reduce4F(opcode, dst, src, vtmp2);
1796   vextractf128_high(vtmp2, src);
1797   reduce4F(opcode, dst, vtmp2, vtmp1);
1798 }
1799 
1800 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1801   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1802   vextracti64x4_high(vtmp1, src);
1803   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1804 }
1805 
1806 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1807   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1808   pshufd(vtmp, src, 0xE);
1809   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1810 }
1811 
1812 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1813   reduce2D(opcode, dst, src, vtmp2);
1814   vextractf128_high(vtmp2, src);
1815   reduce2D(opcode, dst, vtmp2, vtmp1);
1816 }
1817 
1818 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1819   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1820   vextracti64x4_high(vtmp1, src);
1821   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1822 }
1823 
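     // Float min/max reduction: log2(vlen) halving steps. Each step extracts the upper half
     // (or, within a 128-bit lane, permutes the remaining elements down) and combines it with
     // the lower half via vminmax_fp. When dst already holds a valid accumulator
     // (is_dst_valid), it is folded in with a final vminmax_fp.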
1824 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1825                                           XMMRegister dst, XMMRegister src,
1826                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1827                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1828   int permconst[] = {1, 14};
1829   XMMRegister wsrc = src;
1830   XMMRegister wdst = xmm_0;
1831   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1832 
1833   int vlen_enc = Assembler::AVX_128bit;
1834   if (vlen == 16) {
1835     vlen_enc = Assembler::AVX_256bit;
1836   }
1837 
1838   for (int i = log2(vlen) - 1; i >=0; i--) {
1839     if (i == 0 && !is_dst_valid) {
1840       wdst = dst;
1841     }
1842     if (i == 3) {
1843       vextracti64x4_high(wtmp, wsrc);
1844     } else if (i == 2) {
1845       vextracti128_high(wtmp, wsrc);
1846     } else { // i = [0,1]
1847       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1848     }
1849     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1850     wsrc = wdst;
1851     vlen_enc = Assembler::AVX_128bit;
1852   }
1853   if (is_dst_valid) {
1854     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1855   }
1856 }
1857 
1858 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1859                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1860                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1861   XMMRegister wsrc = src;
1862   XMMRegister wdst = xmm_0;
1863   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1864   int vlen_enc = Assembler::AVX_128bit;
1865   if (vlen == 8) {
1866     vlen_enc = Assembler::AVX_256bit;
1867   }
1868   for (int i = log2(vlen) - 1; i >=0; i--) {
1869     if (i == 0 && !is_dst_valid) {
1870       wdst = dst;
1871     }
1872     if (i == 1) {
1873       vextracti128_high(wtmp, wsrc);
1874     } else if (i == 2) {
1875       vextracti64x4_high(wtmp, wsrc);
1876     } else {
1877       assert(i == 0, "%d", i);
1878       vpermilpd(wtmp, wsrc, 1, vlen_enc);
1879     }
1880     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1881     wsrc = wdst;
1882     vlen_enc = Assembler::AVX_128bit;
1883   }
1884   if (is_dst_valid) {
1885     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1886   }
1887 }
1888 
1889 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
1890   switch (bt) {
1891     case T_BYTE:  pextrb(dst, src, idx); break;
1892     case T_SHORT: pextrw(dst, src, idx); break;
1893     case T_INT:   pextrd(dst, src, idx); break;
1894     case T_LONG:  pextrq(dst, src, idx); break;
1895 
1896     default:
1897       assert(false,"Should not reach here.");
1898       break;
1899   }
1900 }
1901 
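     // Return the 128-bit lane holding 'elemindex': the low lane is returned as src itself,
     // higher lanes are extracted into dst first.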
1902 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
1903   int esize =  type2aelembytes(typ);
1904   int elem_per_lane = 16/esize;
1905   int lane = elemindex / elem_per_lane;
1906   int eindex = elemindex % elem_per_lane;
1907 
1908   if (lane >= 2) {
1909     assert(UseAVX > 2, "required");
1910     vextractf32x4(dst, src, lane & 3);
1911     return dst;
1912   } else if (lane > 0) {
1913     assert(UseAVX > 0, "required");
1914     vextractf128(dst, src, lane);
1915     return dst;
1916   } else {
1917     return src;
1918   }
1919 }
1920 
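     // Extract an integral element into a general register. Only the in-lane index is used
     // (the containing 128-bit lane is selected separately, see get_lane above); sub-int
     // types are sign-extended.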
1921 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
1922   int esize =  type2aelembytes(typ);
1923   int elem_per_lane = 16/esize;
1924   int eindex = elemindex % elem_per_lane;
1925   assert(is_integral_type(typ),"required");
1926 
1927   if (eindex == 0) {
1928     if (typ == T_LONG) {
1929       movq(dst, src);
1930     } else {
1931       movdl(dst, src);
1932       if (typ == T_BYTE)
1933         movsbl(dst, dst);
1934       else if (typ == T_SHORT)
1935         movswl(dst, dst);
1936     }
1937   } else {
1938     extract(typ, dst, src, eindex);
1939   }
1940 }
1941 
1942 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
1943   int esize =  type2aelembytes(typ);
1944   int elem_per_lane = 16/esize;
1945   int eindex = elemindex % elem_per_lane;
1946   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
1947 
1948   if (eindex == 0) {
1949     movq(dst, src);
1950   } else {
1951     if (typ == T_FLOAT) {
1952       if (UseAVX == 0) {
1953         movdqu(dst, src);
1954         pshufps(dst, dst, eindex);
1955       } else {
1956         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
1957       }
1958     } else {
1959       if (UseAVX == 0) {
1960         movdqu(dst, src);
1961         psrldq(dst, eindex*esize);
1962       } else {
1963         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
1964       }
1965       movq(dst, dst);
1966     }
1967   }
1968   // Zero upper bits
1969   if (typ == T_FLOAT) {
1970     if (UseAVX == 0) {
1971       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
1972       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
1973       pand(dst, vtmp);
1974     } else {
1975       assert((tmp != noreg), "required.");
1976       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
1977     }
1978   }
1979 }
1980 
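     // AVX-512 compare against a memory operand producing an opmask; the element width is
     // chosen from the Java basic type.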
1981 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
1982   switch(typ) {
1983     case T_BYTE:
1984       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1985       break;
1986     case T_SHORT:
1987       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1988       break;
1989     case T_INT:
1990     case T_FLOAT:
1991       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1992       break;
1993     case T_LONG:
1994     case T_DOUBLE:
1995       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
1996       break;
1997     default:
1998       assert(false,"Should not reach here.");
1999       break;
2000   }
2001 }
2002 
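     // AVX-512 masked blend between src1 and src2, with element width chosen from the type.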
2003 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2004   switch(typ) {
2005     case T_BYTE:
2006       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2007       break;
2008     case T_SHORT:
2009       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2010       break;
2011     case T_INT:
2012     case T_FLOAT:
2013       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2014       break;
2015     case T_LONG:
2016     case T_DOUBLE:
2017       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2018       break;
2019     default:
2020       assert(false,"Should not reach here.");
2021       break;
2022   }
2023 }
2024 
2025 //-------------------------------------------------------------------------------------------
2026 
2027 // IndexOf for constant substrings with size >= 8 chars
2028 // which don't need to be loaded through the stack.
2029 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2030                                          Register cnt1, Register cnt2,
2031                                          int int_cnt2,  Register result,
2032                                          XMMRegister vec, Register tmp,
2033                                          int ae) {
2034   ShortBranchVerifier sbv(this);
2035   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2036   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2037 
2038   // This method uses the pcmpestri instruction with bound registers
2039   //   inputs:
2040   //     xmm - substring
2041   //     rax - substring length (elements count)
2042   //     mem - scanned string
2043   //     rdx - string length (elements count)
2044   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2045   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2046   //   outputs:
2047   //     rcx - matched index in string
2048   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2049   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2050   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2051   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2052   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2053 
2054   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2055         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2056         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2057 
2058   // Note, inline_string_indexOf() generates checks:
2059   // if (substr.count > string.count) return -1;
2060   // if (substr.count == 0) return 0;
2061   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2062 
2063   // Load substring.
2064   if (ae == StrIntrinsicNode::UL) {
2065     pmovzxbw(vec, Address(str2, 0));
2066   } else {
2067     movdqu(vec, Address(str2, 0));
2068   }
2069   movl(cnt2, int_cnt2);
2070   movptr(result, str1); // string addr
2071 
2072   if (int_cnt2 > stride) {
2073     jmpb(SCAN_TO_SUBSTR);
2074 
2075     // Reload substr for rescan; this code
2076     // is executed only for large substrings (> 8 chars)
2077     bind(RELOAD_SUBSTR);
2078     if (ae == StrIntrinsicNode::UL) {
2079       pmovzxbw(vec, Address(str2, 0));
2080     } else {
2081       movdqu(vec, Address(str2, 0));
2082     }
2083     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2084 
2085     bind(RELOAD_STR);
2086     // We came here after the beginning of the substring was
2087     // matched but the rest of it was not, so we need to search
2088     // again. Start from the next element after the previous match.
2089 
2090     // cnt2 is the number of remaining substring elements and
2091     // cnt1 is the number of remaining string elements when the compare failed.
2092     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2093     subl(cnt1, cnt2);
2094     addl(cnt1, int_cnt2);
2095     movl(cnt2, int_cnt2); // Now restore cnt2
2096 
2097     decrementl(cnt1);     // Shift to next element
2098     cmpl(cnt1, cnt2);
2099     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2100 
2101     addptr(result, (1<<scale1));
2102 
2103   } // (int_cnt2 > 8)
2104 
2105   // Scan string for start of substr in 16-byte vectors
2106   bind(SCAN_TO_SUBSTR);
2107   pcmpestri(vec, Address(result, 0), mode);
2108   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2109   subl(cnt1, stride);
2110   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2111   cmpl(cnt1, cnt2);
2112   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2113   addptr(result, 16);
2114   jmpb(SCAN_TO_SUBSTR);
2115 
2116   // Found a potential substr
2117   bind(FOUND_CANDIDATE);
2118   // Matched whole vector if first element matched (tmp(rcx) == 0).
2119   if (int_cnt2 == stride) {
2120     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2121   } else { // int_cnt2 > 8
2122     jccb(Assembler::overflow, FOUND_SUBSTR);
2123   }
2124   // After pcmpestri tmp(rcx) contains matched element index
2125   // Compute start addr of substr
2126   lea(result, Address(result, tmp, scale1));
2127 
2128   // Make sure string is still long enough
2129   subl(cnt1, tmp);
2130   cmpl(cnt1, cnt2);
2131   if (int_cnt2 == stride) {
2132     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2133   } else { // int_cnt2 > 8
2134     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2135   }
2136   // Left less than substring.
2137 
2138   bind(RET_NOT_FOUND);
2139   movl(result, -1);
2140   jmp(EXIT);
2141 
2142   if (int_cnt2 > stride) {
2143     // This code is optimized for the case when the whole substring
2144     // is matched once its head is matched.
2145     bind(MATCH_SUBSTR_HEAD);
2146     pcmpestri(vec, Address(result, 0), mode);
2147     // Reload only the string if it does not match
2148     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2149 
2150     Label CONT_SCAN_SUBSTR;
2151     // Compare the rest of substring (> 8 chars).
2152     bind(FOUND_SUBSTR);
2153     // First 8 chars are already matched.
2154     negptr(cnt2);
2155     addptr(cnt2, stride);
2156 
2157     bind(SCAN_SUBSTR);
2158     subl(cnt1, stride);
2159     cmpl(cnt2, -stride); // Do not read beyond substring
2160     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2161     // Back-up strings to avoid reading beyond substring:
2162     // cnt1 = cnt1 - cnt2 + 8
2163     addl(cnt1, cnt2); // cnt2 is negative
2164     addl(cnt1, stride);
2165     movl(cnt2, stride); negptr(cnt2);
2166     bind(CONT_SCAN_SUBSTR);
2167     if (int_cnt2 < (int)G) {
2168       int tail_off1 = int_cnt2<<scale1;
2169       int tail_off2 = int_cnt2<<scale2;
2170       if (ae == StrIntrinsicNode::UL) {
2171         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2172       } else {
2173         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2174       }
2175       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2176     } else {
2177       // calculate index in register to avoid integer overflow (int_cnt2*2)
2178       movl(tmp, int_cnt2);
2179       addptr(tmp, cnt2);
2180       if (ae == StrIntrinsicNode::UL) {
2181         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2182       } else {
2183         movdqu(vec, Address(str2, tmp, scale2, 0));
2184       }
2185       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2186     }
2187     // Need to reload the string pointers if we did not match the whole vector
2188     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2189     addptr(cnt2, stride);
2190     jcc(Assembler::negative, SCAN_SUBSTR);
2191     // Fall through if found full substring
2192 
2193   } // (int_cnt2 > 8)
2194 
2195   bind(RET_FOUND);
2196   // Found result if we matched full small substring.
2197   // Compute substr offset
2198   subptr(result, str1);
2199   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2200     shrl(result, 1); // index
2201   }
2202   bind(EXIT);
2203 
2204 } // string_indexofC8
2205 
2206 // Small strings are loaded through the stack if they cross a page boundary.
2207 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2208                                        Register cnt1, Register cnt2,
2209                                        int int_cnt2,  Register result,
2210                                        XMMRegister vec, Register tmp,
2211                                        int ae) {
2212   ShortBranchVerifier sbv(this);
2213   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2214   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2215 
2216   //
2217   // int_cnt2 is length of small (< 8 chars) constant substring
2218   // or (-1) for non constant substring in which case its length
2219   // is in cnt2 register.
2220   //
2221   // Note, inline_string_indexOf() generates checks:
2222   // if (substr.count > string.count) return -1;
2223   // if (substr.count == 0) return 0;
2224   //
2225   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2226   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2227   // This method uses the pcmpestri instruction with bound registers
2228   //   inputs:
2229   //     xmm - substring
2230   //     rax - substring length (elements count)
2231   //     mem - scanned string
2232   //     rdx - string length (elements count)
2233   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2234   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2235   //   outputs:
2236   //     rcx - matched index in string
2237   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2238   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2239   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2240   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2241 
2242   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2243         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2244         FOUND_CANDIDATE;
2245 
2246   { //========================================================
2247     // We don't know where these strings are located
2248     // and we can't read beyond them. Load them through the stack.
2249     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2250 
2251     movptr(tmp, rsp); // save old SP
2252 
2253     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2254       if (int_cnt2 == (1>>scale2)) { // One byte
2255         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2256         load_unsigned_byte(result, Address(str2, 0));
2257         movdl(vec, result); // move 32 bits
2258       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2259         // Not enough header space in 32-bit VM: 12+3 = 15.
2260         movl(result, Address(str2, -1));
2261         shrl(result, 8);
2262         movdl(vec, result); // move 32 bits
2263       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2264         load_unsigned_short(result, Address(str2, 0));
2265         movdl(vec, result); // move 32 bits
2266       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2267         movdl(vec, Address(str2, 0)); // move 32 bits
2268       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2269         movq(vec, Address(str2, 0));  // move 64 bits
2270       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2271         // Array header size is 12 bytes in 32-bit VM
2272         // + 6 bytes for 3 chars == 18 bytes,
2273         // enough space to load vec and shift.
2274         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2275         if (ae == StrIntrinsicNode::UL) {
2276           int tail_off = int_cnt2-8;
2277           pmovzxbw(vec, Address(str2, tail_off));
2278           psrldq(vec, -2*tail_off);
2279         }
2280         else {
2281           int tail_off = int_cnt2*(1<<scale2);
2282           movdqu(vec, Address(str2, tail_off-16));
2283           psrldq(vec, 16-tail_off);
2284         }
2285       }
2286     } else { // not constant substring
2287       cmpl(cnt2, stride);
2288       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2289 
2290       // We can read beyond the string if str+16 does not cross a page boundary
2291       // since heaps are aligned and mapped by pages.
2292       assert(os::vm_page_size() < (int)G, "default page should be small");
2293       movl(result, str2); // We need only low 32 bits
2294       andl(result, (os::vm_page_size()-1));
2295       cmpl(result, (os::vm_page_size()-16));
2296       jccb(Assembler::belowEqual, CHECK_STR);
2297 
2298       // Move small strings to the stack to allow loading 16 bytes into vec.
2299       subptr(rsp, 16);
2300       int stk_offset = wordSize-(1<<scale2);
2301       push(cnt2);
2302 
2303       bind(COPY_SUBSTR);
2304       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2305         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2306         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2307       } else if (ae == StrIntrinsicNode::UU) {
2308         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2309         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2310       }
2311       decrement(cnt2);
2312       jccb(Assembler::notZero, COPY_SUBSTR);
2313 
2314       pop(cnt2);
2315       movptr(str2, rsp);  // New substring address
2316     } // non constant
2317 
2318     bind(CHECK_STR);
2319     cmpl(cnt1, stride);
2320     jccb(Assembler::aboveEqual, BIG_STRINGS);
2321 
2322     // Check cross page boundary.
2323     movl(result, str1); // We need only low 32 bits
2324     andl(result, (os::vm_page_size()-1));
2325     cmpl(result, (os::vm_page_size()-16));
2326     jccb(Assembler::belowEqual, BIG_STRINGS);
2327 
2328     subptr(rsp, 16);
2329     int stk_offset = -(1<<scale1);
2330     if (int_cnt2 < 0) { // not constant
2331       push(cnt2);
2332       stk_offset += wordSize;
2333     }
2334     movl(cnt2, cnt1);
2335 
2336     bind(COPY_STR);
2337     if (ae == StrIntrinsicNode::LL) {
2338       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2339       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2340     } else {
2341       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2342       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2343     }
2344     decrement(cnt2);
2345     jccb(Assembler::notZero, COPY_STR);
2346 
2347     if (int_cnt2 < 0) { // not constant
2348       pop(cnt2);
2349     }
2350     movptr(str1, rsp);  // New string address
2351 
2352     bind(BIG_STRINGS);
2353     // Load substring.
2354     if (int_cnt2 < 0) { // -1
2355       if (ae == StrIntrinsicNode::UL) {
2356         pmovzxbw(vec, Address(str2, 0));
2357       } else {
2358         movdqu(vec, Address(str2, 0));
2359       }
2360       push(cnt2);       // substr count
2361       push(str2);       // substr addr
2362       push(str1);       // string addr
2363     } else {
2364       // Small (< 8 chars) constant substrings are loaded already.
2365       movl(cnt2, int_cnt2);
2366     }
2367     push(tmp);  // original SP
2368 
2369   } // Finished loading
2370 
2371   //========================================================
2372   // Start search
2373   //
2374 
2375   movptr(result, str1); // string addr
2376 
2377   if (int_cnt2  < 0) {  // Only for non constant substring
2378     jmpb(SCAN_TO_SUBSTR);
2379 
2380     // SP saved at sp+0
2381     // String saved at sp+1*wordSize
2382     // Substr saved at sp+2*wordSize
2383     // Substr count saved at sp+3*wordSize
2384 
2385     // Reload substr for rescan; this code
2386     // is executed only for large substrings (> 8 chars)
2387     bind(RELOAD_SUBSTR);
2388     movptr(str2, Address(rsp, 2*wordSize));
2389     movl(cnt2, Address(rsp, 3*wordSize));
2390     if (ae == StrIntrinsicNode::UL) {
2391       pmovzxbw(vec, Address(str2, 0));
2392     } else {
2393       movdqu(vec, Address(str2, 0));
2394     }
2395     // We came here after the beginning of the substring was
2396     // matched but the rest of it was not, so we need to search
2397     // again. Start from the next element after the previous match.
2398     subptr(str1, result); // Restore counter
2399     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2400       shrl(str1, 1);
2401     }
2402     addl(cnt1, str1);
2403     decrementl(cnt1);   // Shift to next element
2404     cmpl(cnt1, cnt2);
2405     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2406 
2407     addptr(result, (1<<scale1));
2408   } // non constant
2409 
2410   // Scan string for start of substr in 16-byte vectors
2411   bind(SCAN_TO_SUBSTR);
2412   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2413   pcmpestri(vec, Address(result, 0), mode);
2414   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2415   subl(cnt1, stride);
2416   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2417   cmpl(cnt1, cnt2);
2418   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2419   addptr(result, 16);
2420 
2421   bind(ADJUST_STR);
2422   cmpl(cnt1, stride); // Do not read beyond string
2423   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2424   // Back-up string to avoid reading beyond string.
2425   lea(result, Address(result, cnt1, scale1, -16));
2426   movl(cnt1, stride);
2427   jmpb(SCAN_TO_SUBSTR);
2428 
2429   // Found a potential substr
2430   bind(FOUND_CANDIDATE);
2431   // After pcmpestri tmp(rcx) contains matched element index
2432 
2433   // Make sure string is still long enough
2434   subl(cnt1, tmp);
2435   cmpl(cnt1, cnt2);
2436   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2437   // Left less than substring.
2438 
2439   bind(RET_NOT_FOUND);
2440   movl(result, -1);
2441   jmp(CLEANUP);
2442 
2443   bind(FOUND_SUBSTR);
2444   // Compute start addr of substr
2445   lea(result, Address(result, tmp, scale1));
2446   if (int_cnt2 > 0) { // Constant substring
2447     // Repeat search for small substring (< 8 chars)
2448     // from new point without reloading substring.
2449     // Have to check that we don't read beyond string.
2450     cmpl(tmp, stride-int_cnt2);
2451     jccb(Assembler::greater, ADJUST_STR);
2452     // Fall through if matched whole substring.
2453   } else { // non constant
2454     assert(int_cnt2 == -1, "should be != 0");
2455 
2456     addl(tmp, cnt2);
2457     // Found result if we matched whole substring.
2458     cmpl(tmp, stride);
2459     jcc(Assembler::lessEqual, RET_FOUND);
2460 
2461     // Repeat search for small substring (<= 8 chars)
2462     // from new point 'str1' without reloading substring.
2463     cmpl(cnt2, stride);
2464     // Have to check that we don't read beyond string.
2465     jccb(Assembler::lessEqual, ADJUST_STR);
2466 
2467     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2468     // Compare the rest of substring (> 8 chars).
2469     movptr(str1, result);
2470 
2471     cmpl(tmp, cnt2);
2472     // First 8 chars are already matched.
2473     jccb(Assembler::equal, CHECK_NEXT);
2474 
2475     bind(SCAN_SUBSTR);
2476     pcmpestri(vec, Address(str1, 0), mode);
2477     // Need to reload the string pointers if we did not match the whole vector
2478     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2479 
2480     bind(CHECK_NEXT);
2481     subl(cnt2, stride);
2482     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2483     addptr(str1, 16);
2484     if (ae == StrIntrinsicNode::UL) {
2485       addptr(str2, 8);
2486     } else {
2487       addptr(str2, 16);
2488     }
2489     subl(cnt1, stride);
2490     cmpl(cnt2, stride); // Do not read beyond substring
2491     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2492     // Back-up strings to avoid reading beyond substring.
2493 
2494     if (ae == StrIntrinsicNode::UL) {
2495       lea(str2, Address(str2, cnt2, scale2, -8));
2496       lea(str1, Address(str1, cnt2, scale1, -16));
2497     } else {
2498       lea(str2, Address(str2, cnt2, scale2, -16));
2499       lea(str1, Address(str1, cnt2, scale1, -16));
2500     }
2501     subl(cnt1, cnt2);
2502     movl(cnt2, stride);
2503     addl(cnt1, stride);
2504     bind(CONT_SCAN_SUBSTR);
2505     if (ae == StrIntrinsicNode::UL) {
2506       pmovzxbw(vec, Address(str2, 0));
2507     } else {
2508       movdqu(vec, Address(str2, 0));
2509     }
2510     jmp(SCAN_SUBSTR);
2511 
2512     bind(RET_FOUND_LONG);
2513     movptr(str1, Address(rsp, wordSize));
2514   } // non constant
2515 
2516   bind(RET_FOUND);
2517   // Compute substr offset
2518   subptr(result, str1);
2519   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2520     shrl(result, 1); // index
2521   }
2522   bind(CLEANUP);
2523   pop(rsp); // restore SP
2524 
2525 } // string_indexof
2526 
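     // Find the first occurrence of a char in a UTF-16 string. Scans 16 chars at a time with
     // AVX2 (vpcmpeqw), then 8 chars at a time with SSE, then a scalar tail loop; returns the
     // character index of the match in 'result', or -1 if the char is not found.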
2527 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2528                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2529   ShortBranchVerifier sbv(this);
2530   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2531 
2532   int stride = 8;
2533 
2534   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2535         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2536         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2537         FOUND_SEQ_CHAR, DONE_LABEL;
2538 
2539   movptr(result, str1);
2540   if (UseAVX >= 2) {
2541     cmpl(cnt1, stride);
2542     jcc(Assembler::less, SCAN_TO_CHAR);
2543     cmpl(cnt1, 2*stride);
2544     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2545     movdl(vec1, ch);
2546     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2547     vpxor(vec2, vec2);
2548     movl(tmp, cnt1);
2549     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2550     andl(cnt1,0x0000000F);  //tail count (in chars)
2551 
2552     bind(SCAN_TO_16_CHAR_LOOP);
2553     vmovdqu(vec3, Address(result, 0));
2554     vpcmpeqw(vec3, vec3, vec1, 1);
2555     vptest(vec2, vec3);
2556     jcc(Assembler::carryClear, FOUND_CHAR);
2557     addptr(result, 32);
2558     subl(tmp, 2*stride);
2559     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2560     jmp(SCAN_TO_8_CHAR);
2561     bind(SCAN_TO_8_CHAR_INIT);
2562     movdl(vec1, ch);
2563     pshuflw(vec1, vec1, 0x00);
2564     pshufd(vec1, vec1, 0);
2565     pxor(vec2, vec2);
2566   }
2567   bind(SCAN_TO_8_CHAR);
2568   cmpl(cnt1, stride);
2569   jcc(Assembler::less, SCAN_TO_CHAR);
2570   if (UseAVX < 2) {
2571     movdl(vec1, ch);
2572     pshuflw(vec1, vec1, 0x00);
2573     pshufd(vec1, vec1, 0);
2574     pxor(vec2, vec2);
2575   }
2576   movl(tmp, cnt1);
2577   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2578   andl(cnt1,0x00000007);  //tail count (in chars)
2579 
2580   bind(SCAN_TO_8_CHAR_LOOP);
2581   movdqu(vec3, Address(result, 0));
2582   pcmpeqw(vec3, vec1);
2583   ptest(vec2, vec3);
2584   jcc(Assembler::carryClear, FOUND_CHAR);
2585   addptr(result, 16);
2586   subl(tmp, stride);
2587   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2588   bind(SCAN_TO_CHAR);
2589   testl(cnt1, cnt1);
2590   jcc(Assembler::zero, RET_NOT_FOUND);
2591   bind(SCAN_TO_CHAR_LOOP);
2592   load_unsigned_short(tmp, Address(result, 0));
2593   cmpl(ch, tmp);
2594   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2595   addptr(result, 2);
2596   subl(cnt1, 1);
2597   jccb(Assembler::zero, RET_NOT_FOUND);
2598   jmp(SCAN_TO_CHAR_LOOP);
2599 
2600   bind(RET_NOT_FOUND);
2601   movl(result, -1);
2602   jmpb(DONE_LABEL);
2603 
2604   bind(FOUND_CHAR);
2605   if (UseAVX >= 2) {
2606     vpmovmskb(tmp, vec3);
2607   } else {
2608     pmovmskb(tmp, vec3);
2609   }
2610   bsfl(ch, tmp);
2611   addl(result, ch);
2612 
2613   bind(FOUND_SEQ_CHAR);
2614   subptr(result, str1);
2615   shrl(result, 1);
2616 
2617   bind(DONE_LABEL);
2618 } // string_indexof_char
2619 
2620 // helper function for string_compare
2621 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2622                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2623                                            Address::ScaleFactor scale2, Register index, int ae) {
2624   if (ae == StrIntrinsicNode::LL) {
2625     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2626     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2627   } else if (ae == StrIntrinsicNode::UU) {
2628     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2629     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2630   } else {
2631     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2632     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2633   }
2634 }
2635 
2636 // Compare strings, used for char[] and byte[].
2637 void C2_MacroAssembler::string_compare(Register str1, Register str2,
2638                                        Register cnt1, Register cnt2, Register result,
2639                                        XMMRegister vec1, int ae) {
2640   ShortBranchVerifier sbv(this);
2641   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
2642   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
2643   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
2644   int stride2x2 = 0x40;
2645   Address::ScaleFactor scale = Address::no_scale;
2646   Address::ScaleFactor scale1 = Address::no_scale;
2647   Address::ScaleFactor scale2 = Address::no_scale;
2648 
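  // stride2x2 is the element count consumed per 64-byte AVX-512 iteration:
  // 64 byte elements for LL, 32 char-sized elements otherwise.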
2649   if (ae != StrIntrinsicNode::LL) {
2650     stride2x2 = 0x20;
2651   }
2652 
2653   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
2654     shrl(cnt2, 1);
2655   }
2656   // Compute the minimum of the string lengths (using a conditional move)
2657   // and push the difference of the string lengths onto the stack.
2659   movl(result, cnt1);
2660   subl(cnt1, cnt2);
2661   push(cnt1);
2662   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
2663 
2664   // Is the minimum length zero?
2665   testl(cnt2, cnt2);
2666   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2667   if (ae == StrIntrinsicNode::LL) {
2668     // Load first bytes
2669     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
2670     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
2671   } else if (ae == StrIntrinsicNode::UU) {
2672     // Load first characters
2673     load_unsigned_short(result, Address(str1, 0));
2674     load_unsigned_short(cnt1, Address(str2, 0));
2675   } else {
2676     load_unsigned_byte(result, Address(str1, 0));
2677     load_unsigned_short(cnt1, Address(str2, 0));
2678   }
2679   subl(result, cnt1);
2680   jcc(Assembler::notZero,  POP_LABEL);
2681 
2682   if (ae == StrIntrinsicNode::UU) {
2683     // Divide length by 2 to get number of chars
2684     shrl(cnt2, 1);
2685   }
2686   cmpl(cnt2, 1);
2687   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2688 
2689   // Check if the strings start at the same location, and set up scale and stride
2690   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2691     cmpptr(str1, str2);
2692     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2693     if (ae == StrIntrinsicNode::LL) {
2694       scale = Address::times_1;
2695       stride = 16;
2696     } else {
2697       scale = Address::times_2;
2698       stride = 8;
2699     }
2700   } else {
2701     scale1 = Address::times_1;
2702     scale2 = Address::times_2;
2703     // scale not used
2704     stride = 8;
2705   }
2706 
2707   if (UseAVX >= 2 && UseSSE42Intrinsics) {
2708     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
2709     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
2710     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
2711     Label COMPARE_TAIL_LONG;
2712     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
2713 
2714     int pcmpmask = 0x19;
2715     if (ae == StrIntrinsicNode::LL) {
2716       pcmpmask &= ~0x01;
2717     }
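    // The base mask 0x18 selects pcmpestri's 'equal each' comparison with negated
    // result; the low bit selects the element size (0 = unsigned bytes for LL,
    // 1 = unsigned words).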
2718 
2719     // Set up to compare vectors of 16 chars (32 bytes),
2720     // starting from the first character again because its address is aligned.
2721     if (ae == StrIntrinsicNode::LL) {
2722       stride2 = 32;
2723     } else {
2724       stride2 = 16;
2725     }
2726     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2727       adr_stride = stride << scale;
2728     } else {
2729       adr_stride1 = 8;  //stride << scale1;
2730       adr_stride2 = 16; //stride << scale2;
2731     }
2732 
2733     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2734     // rax and rdx are used by pcmpestri as element counters
2735     movl(result, cnt2);
2736     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
2737     jcc(Assembler::zero, COMPARE_TAIL_LONG);
2738 
2739     // Fast path: compare the first two 8-char vectors.
2740     bind(COMPARE_16_CHARS);
2741     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2742       movdqu(vec1, Address(str1, 0));
2743     } else {
2744       pmovzxbw(vec1, Address(str1, 0));
2745     }
2746     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2747     jccb(Assembler::below, COMPARE_INDEX_CHAR);
2748 
2749     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2750       movdqu(vec1, Address(str1, adr_stride));
2751       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
2752     } else {
2753       pmovzxbw(vec1, Address(str1, adr_stride1));
2754       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
2755     }
2756     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
2757     addl(cnt1, stride);
2758 
2759     // Compare the characters at index in cnt1
2760     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
2761     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2762     subl(result, cnt2);
2763     jmp(POP_LABEL);
2764 
2765     // Set up the registers to start the vector comparison loop
2766     bind(COMPARE_WIDE_VECTORS);
2767     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2768       lea(str1, Address(str1, result, scale));
2769       lea(str2, Address(str2, result, scale));
2770     } else {
2771       lea(str1, Address(str1, result, scale1));
2772       lea(str2, Address(str2, result, scale2));
2773     }
2774     subl(result, stride2);
2775     subl(cnt2, stride2);
2776     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
2777     negptr(result);
2778 
2779     // In a loop, compare 16 chars (32 bytes) at a time using vpxor+vptest
2780     bind(COMPARE_WIDE_VECTORS_LOOP);
2781 
2782 #ifdef _LP64
2783     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
2784       cmpl(cnt2, stride2x2);
2785       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2786       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
2787       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
2788 
2789       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2790       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2791         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
2792         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
2793       } else {
2794         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
2795         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
2796       }
2797       kortestql(k7, k7);
2798       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
2799       addptr(result, stride2x2);  // update since we already compared at this addr
2800       subl(cnt2, stride2x2);      // and sub the size too
2801       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2802 
2803       vpxor(vec1, vec1);
2804       jmpb(COMPARE_WIDE_TAIL);
2805     }//if (VM_Version::supports_avx512vlbw())
2806 #endif // _LP64
2807 
2808 
2809     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2810     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2811       vmovdqu(vec1, Address(str1, result, scale));
2812       vpxor(vec1, Address(str2, result, scale));
2813     } else {
2814       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
2815       vpxor(vec1, Address(str2, result, scale2));
2816     }
2817     vptest(vec1, vec1);
2818     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
2819     addptr(result, stride2);
2820     subl(cnt2, stride2);
2821     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
2822     // clean upper bits of YMM registers
2823     vpxor(vec1, vec1);
2824 
2825     // compare wide vectors tail
2826     bind(COMPARE_WIDE_TAIL);
2827     testptr(result, result);
2828     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2829 
2830     movl(result, stride2);
2831     movl(cnt2, result);
2832     negptr(result);
2833     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2834 
2835     // Identify the mismatching (upper or lower) 16 bytes within the 32-byte vectors.
2836     bind(VECTOR_NOT_EQUAL);
2837     // clean upper bits of YMM registers
2838     vpxor(vec1, vec1);
2839     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2840       lea(str1, Address(str1, result, scale));
2841       lea(str2, Address(str2, result, scale));
2842     } else {
2843       lea(str1, Address(str1, result, scale1));
2844       lea(str2, Address(str2, result, scale2));
2845     }
2846     jmp(COMPARE_16_CHARS);
2847 
2848     // Compare tail chars; the length is between 1 and 15 chars.
2849     bind(COMPARE_TAIL_LONG);
2850     movl(cnt2, result);
2851     cmpl(cnt2, stride);
2852     jcc(Assembler::less, COMPARE_SMALL_STR);
2853 
2854     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2855       movdqu(vec1, Address(str1, 0));
2856     } else {
2857       pmovzxbw(vec1, Address(str1, 0));
2858     }
2859     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2860     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2861     subptr(cnt2, stride);
2862     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2863     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2864       lea(str1, Address(str1, result, scale));
2865       lea(str2, Address(str2, result, scale));
2866     } else {
2867       lea(str1, Address(str1, result, scale1));
2868       lea(str2, Address(str2, result, scale2));
2869     }
2870     negptr(cnt2);
2871     jmpb(WHILE_HEAD_LABEL);
2872 
2873     bind(COMPARE_SMALL_STR);
2874   } else if (UseSSE42Intrinsics) {
2875     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2876     int pcmpmask = 0x19;
2877     // Set up to compare 8-char (16-byte) vectors,
2878     // starting from the first character again because its address is aligned.
2879     movl(result, cnt2);
2880     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2881     if (ae == StrIntrinsicNode::LL) {
2882       pcmpmask &= ~0x01;
2883     }
2884     jcc(Assembler::zero, COMPARE_TAIL);
2885     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2886       lea(str1, Address(str1, result, scale));
2887       lea(str2, Address(str2, result, scale));
2888     } else {
2889       lea(str1, Address(str1, result, scale1));
2890       lea(str2, Address(str2, result, scale2));
2891     }
2892     negptr(result);
2893 
2894     // pcmpestri
2895     //   inputs:
2896     //     vec1 - substring
2897     //     rax - negative string length (element count)
2898     //     mem - scanned string
2899     //     rdx - string length (element count)
2900     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2901     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2902     //   outputs:
2903     //     rcx - first mismatched element index
2904     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2905 
2906     bind(COMPARE_WIDE_VECTORS);
2907     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2908       movdqu(vec1, Address(str1, result, scale));
2909       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2910     } else {
2911       pmovzxbw(vec1, Address(str1, result, scale1));
2912       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2913     }
2914     // After pcmpestri, cnt1 (rcx) contains the index of the first mismatched element
2915 
2916     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2917     addptr(result, stride);
2918     subptr(cnt2, stride);
2919     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2920 
2921     // compare wide vectors tail
2922     testptr(result, result);
2923     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2924 
2925     movl(cnt2, stride);
2926     movl(result, stride);
2927     negptr(result);
2928     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2929       movdqu(vec1, Address(str1, result, scale));
2930       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2931     } else {
2932       pmovzxbw(vec1, Address(str1, result, scale1));
2933       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2934     }
2935     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2936 
2937     // Mismatched characters in the vectors
2938     bind(VECTOR_NOT_EQUAL);
2939     addptr(cnt1, result);
2940     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2941     subl(result, cnt2);
2942     jmpb(POP_LABEL);
2943 
2944     bind(COMPARE_TAIL); // limit is zero
2945     movl(cnt2, result);
2946     // Fallthru to tail compare
2947   }
2948   // Shift str2 and str1 to the end of the arrays, negate min
2949   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2950     lea(str1, Address(str1, cnt2, scale));
2951     lea(str2, Address(str2, cnt2, scale));
2952   } else {
2953     lea(str1, Address(str1, cnt2, scale1));
2954     lea(str2, Address(str2, cnt2, scale2));
2955   }
2956   decrementl(cnt2);  // first character was compared already
2957   negptr(cnt2);
2958 
2959   // Compare the rest of the elements
2960   bind(WHILE_HEAD_LABEL);
2961   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2962   subl(result, cnt1);
2963   jccb(Assembler::notZero, POP_LABEL);
2964   increment(cnt2);
2965   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2966 
2967   // Strings are equal up to min length.  Return the length difference.
2968   bind(LENGTH_DIFF_LABEL);
2969   pop(result);
2970   if (ae == StrIntrinsicNode::UU) {
2971     // Divide diff by 2 to get number of chars
2972     sarl(result, 1);
2973   }
2974   jmpb(DONE_LABEL);
2975 
2976 #ifdef _LP64
2977   if (VM_Version::supports_avx512vlbw()) {
2978 
2979     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2980 
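    // k7 has a 0 bit at each mismatching byte position of the 64-byte block;
    // invert it and scan for the first set bit to locate the first mismatch.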
2981     kmovql(cnt1, k7);
2982     notq(cnt1);
2983     bsfq(cnt2, cnt1);
2984     if (ae != StrIntrinsicNode::LL) {
2985       // Divide diff by 2 to get number of chars
2986       sarl(cnt2, 1);
2987     }
2988     addq(result, cnt2);
2989     if (ae == StrIntrinsicNode::LL) {
2990       load_unsigned_byte(cnt1, Address(str2, result));
2991       load_unsigned_byte(result, Address(str1, result));
2992     } else if (ae == StrIntrinsicNode::UU) {
2993       load_unsigned_short(cnt1, Address(str2, result, scale));
2994       load_unsigned_short(result, Address(str1, result, scale));
2995     } else {
2996       load_unsigned_short(cnt1, Address(str2, result, scale2));
2997       load_unsigned_byte(result, Address(str1, result, scale1));
2998     }
2999     subl(result, cnt1);
3000     jmpb(POP_LABEL);
3001   }//if (VM_Version::supports_avx512vlbw())
3002 #endif // _LP64
3003 
3004   // Discard the stored length difference
3005   bind(POP_LABEL);
3006   pop(cnt1);
3007 
3008   // That's it
3009   bind(DONE_LABEL);
3010   if (ae == StrIntrinsicNode::UL) {
3011     negl(result);
3012   }
3013 
3014 }
3015 
3016 // Search for a non-ASCII character (negative byte value) in a byte array;
3017 // return true if one is found and false otherwise.
3018 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3019 //   @HotSpotIntrinsicCandidate
3020 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3021 //     for (int i = off; i < off + len; i++) {
3022 //       if (ba[i] < 0) {
3023 //         return true;
3024 //       }
3025 //     }
3026 //     return false;
3027 //   }
3028 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3029   Register result, Register tmp1,
3030   XMMRegister vec1, XMMRegister vec2) {
3031   // rsi: byte array
3032   // rcx: len
3033   // rax: result
3034   ShortBranchVerifier sbv(this);
3035   assert_different_registers(ary1, len, result, tmp1);
3036   assert_different_registers(vec1, vec2);
3037   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3038 
3039   // len == 0
3040   testl(len, len);
3041   jcc(Assembler::zero, FALSE_LABEL);
3042 
3043   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3044     VM_Version::supports_avx512vlbw() &&
3045     VM_Version::supports_bmi2()) {
3046 
3047     Label test_64_loop, test_tail;
3048     Register tmp3_aliased = len;
3049 
3050     movl(tmp1, len);
3051     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3052 
3053     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3054     andl(len, ~(64 - 1));    // vector count (in chars)
3055     jccb(Assembler::zero, test_tail);
3056 
3057     lea(ary1, Address(ary1, len, Address::times_1));
3058     negptr(len);
3059 
3060     bind(test_64_loop);
3061     // Check whether any of these 64 byte-sized elements is negative
3062     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3063     kortestql(k2, k2);
3064     jcc(Assembler::notZero, TRUE_LABEL);
3065 
3066     addptr(len, 64);
3067     jccb(Assembler::notZero, test_64_loop);
3068 
3069 
3070     bind(test_tail);
3071     // bail out when there is nothing to be done
3072     testl(tmp1, -1);
3073     jcc(Assembler::zero, FALSE_LABEL);
3074 
3075     // Build a k-register mask with the low 'tail count' bits set, i.e. ~(~0 << tmp1)
3076 #ifdef _LP64
3077     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3078     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3079     notq(tmp3_aliased);
3080     kmovql(k3, tmp3_aliased);
3081 #else
3082     Label k_init;
3083     jmp(k_init);
3084 
3085     // On 32-bit we cannot load 64 bits from a general-purpose register, so the
3086     // data needed to compose the 64-bit mask is placed in the instruction stream.
3087     // We emit a 64-byte series of the values 0..63, which is later used as the
3088     // compare target against the tail count held in the tmp1 register.
3089     // The result is a k register with tmp1 consecutive 1 bits, counting from the
3090     // least significant bit.
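    // For example, with a hypothetical tail count of 5 in tmp1, the broadcast value 5
    // is greater than the emitted bytes 0..4 only, so k3 gets exactly the low five
    // bits set.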
3091     address tmp = pc();
3092     emit_int64(0x0706050403020100);
3093     emit_int64(0x0F0E0D0C0B0A0908);
3094     emit_int64(0x1716151413121110);
3095     emit_int64(0x1F1E1D1C1B1A1918);
3096     emit_int64(0x2726252423222120);
3097     emit_int64(0x2F2E2D2C2B2A2928);
3098     emit_int64(0x3736353433323130);
3099     emit_int64(0x3F3E3D3C3B3A3938);
3100 
3101     bind(k_init);
3102     lea(len, InternalAddress(tmp));
3103     // create mask to test for negative byte inside a vector
3104     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3105     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
3106 
3107 #endif
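    // Masked compare of the remaining tail bytes: only the lanes selected by k3
    // participate, and ktestq checks whether any selected lane saw a negative byte.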
3108     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3109     ktestq(k2, k3);
3110     jcc(Assembler::notZero, TRUE_LABEL);
3111 
3112     jmp(FALSE_LABEL);
3113   } else {
3114     movl(result, len); // copy
3115 
3116     if (UseAVX >= 2 && UseSSE >= 2) {
3117       // With AVX2, use 32-byte vector compare
3118       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3119 
3120       // Compare 32-byte vectors
3121       andl(result, 0x0000001f);  //   tail count (in bytes)
3122       andl(len, 0xffffffe0);   // vector count (in bytes)
3123       jccb(Assembler::zero, COMPARE_TAIL);
3124 
3125       lea(ary1, Address(ary1, len, Address::times_1));
3126       negptr(len);
3127 
3128       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (high bit set) in vector
3129       movdl(vec2, tmp1);
3130       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3131 
3132       bind(COMPARE_WIDE_VECTORS);
3133       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3134       vptest(vec1, vec2);
3135       jccb(Assembler::notZero, TRUE_LABEL);
3136       addptr(len, 32);
3137       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3138 
3139       testl(result, result);
3140       jccb(Assembler::zero, FALSE_LABEL);
3141 
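      // Check the last 32 bytes with an overlapping load; this is safe because at
      // least one full 32-byte vector was processed above.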
3142       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3143       vptest(vec1, vec2);
3144       jccb(Assembler::notZero, TRUE_LABEL);
3145       jmpb(FALSE_LABEL);
3146 
3147       bind(COMPARE_TAIL); // len is zero
3148       movl(len, result);
3149       // Fallthru to tail compare
3150     } else if (UseSSE42Intrinsics) {
3151       // With SSE4.2, use double quad vector compare
3152       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3153 
3154       // Compare 16-byte vectors
3155       andl(result, 0x0000000f);  //   tail count (in bytes)
3156       andl(len, 0xfffffff0);   // vector count (in bytes)
3157       jcc(Assembler::zero, COMPARE_TAIL);
3158 
3159       lea(ary1, Address(ary1, len, Address::times_1));
3160       negptr(len);
3161 
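      // create mask to test for negative bytes (high bit set) in vector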
3162       movl(tmp1, 0x80808080);
3163       movdl(vec2, tmp1);
3164       pshufd(vec2, vec2, 0);
3165 
3166       bind(COMPARE_WIDE_VECTORS);
3167       movdqu(vec1, Address(ary1, len, Address::times_1));
3168       ptest(vec1, vec2);
3169       jcc(Assembler::notZero, TRUE_LABEL);
3170       addptr(len, 16);
3171       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3172 
3173       testl(result, result);
3174       jcc(Assembler::zero, FALSE_LABEL);
3175 
3176       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3177       ptest(vec1, vec2);
3178       jccb(Assembler::notZero, TRUE_LABEL);
3179       jmpb(FALSE_LABEL);
3180 
3181       bind(COMPARE_TAIL); // len is zero
3182       movl(len, result);
3183       // Fallthru to tail compare
3184     }
3185   }
3186   // Compare 4-byte vectors
3187   andl(len, 0xfffffffc); // vector count (in bytes)
3188   jccb(Assembler::zero, COMPARE_CHAR);
3189 
3190   lea(ary1, Address(ary1, len, Address::times_1));
3191   negptr(len);
3192 
3193   bind(COMPARE_VECTORS);
3194   movl(tmp1, Address(ary1, len, Address::times_1));
3195   andl(tmp1, 0x80808080);
3196   jccb(Assembler::notZero, TRUE_LABEL);
3197   addptr(len, 4);
3198   jcc(Assembler::notZero, COMPARE_VECTORS);
3199 
3200   // Compare trailing char (final 2 bytes), if any
3201   bind(COMPARE_CHAR);
3202   testl(result, 0x2);   // tail  char
3203   jccb(Assembler::zero, COMPARE_BYTE);
3204   load_unsigned_short(tmp1, Address(ary1, 0));
3205   andl(tmp1, 0x00008080);
3206   jccb(Assembler::notZero, TRUE_LABEL);
3207   subptr(result, 2);
3208   lea(ary1, Address(ary1, 2));
3209 
3210   bind(COMPARE_BYTE);
3211   testl(result, 0x1);   // tail  byte
3212   jccb(Assembler::zero, FALSE_LABEL);
3213   load_unsigned_byte(tmp1, Address(ary1, 0));
3214   andl(tmp1, 0x00000080);
3215   jccb(Assembler::notEqual, TRUE_LABEL);
3216   jmpb(FALSE_LABEL);
3217 
3218   bind(TRUE_LABEL);
3219   movl(result, 1);   // return true
3220   jmpb(DONE);
3221 
3222   bind(FALSE_LABEL);
3223   xorl(result, result); // return false
3224 
3225   // That's it
3226   bind(DONE);
3227   if (UseAVX >= 2 && UseSSE >= 2) {
3228     // clean upper bits of YMM registers
3229     vpxor(vec1, vec1);
3230     vpxor(vec2, vec2);
3231   }
3232 }
3233 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
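// Roughly equivalent scalar semantics for the whole-array case (illustrative
// sketch only, not the actual library source):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
// When is_array_equ is false, the null/length checks and array-header offsets are
// skipped; ary1/ary2 already point at the data to compare.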
3234 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3235                                       Register limit, Register result, Register chr,
3236                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
3237   ShortBranchVerifier sbv(this);
3238   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3239 
3240   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3241   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3242 
3243   if (is_array_equ) {
3244     // Check the input args
3245     cmpoop(ary1, ary2);
3246     jcc(Assembler::equal, TRUE_LABEL);
3247 
3248     // Need additional checks for arrays_equals.
3249     testptr(ary1, ary1);
3250     jcc(Assembler::zero, FALSE_LABEL);
3251     testptr(ary2, ary2);
3252     jcc(Assembler::zero, FALSE_LABEL);
3253 
3254     // Check the lengths
3255     movl(limit, Address(ary1, length_offset));
3256     cmpl(limit, Address(ary2, length_offset));
3257     jcc(Assembler::notEqual, FALSE_LABEL);
3258   }
3259 
3260   // count == 0
3261   testl(limit, limit);
3262   jcc(Assembler::zero, TRUE_LABEL);
3263 
3264   if (is_array_equ) {
3265     // Load array address
3266     lea(ary1, Address(ary1, base_offset));
3267     lea(ary2, Address(ary2, base_offset));
3268   }
3269 
3270   if (is_array_equ && is_char) {
3271     // arrays_equals when used for char[].
3272     shll(limit, 1);      // convert char count to byte count (still != 0)
3273   }
3274   movl(result, limit); // copy
3275 
3276   if (UseAVX >= 2) {
3277     // With AVX2, use 32-byte vector compare
3278     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3279 
3280     // Compare 32-byte vectors
3281     andl(result, 0x0000001f);  //   tail count (in bytes)
3282     andl(limit, 0xffffffe0);   // vector count (in bytes)
3283     jcc(Assembler::zero, COMPARE_TAIL);
3284 
3285     lea(ary1, Address(ary1, limit, Address::times_1));
3286     lea(ary2, Address(ary2, limit, Address::times_1));
3287     negptr(limit);
3288 
3289 #ifdef _LP64
3290     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3291       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3292 
3293       cmpl(limit, -64);
3294       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3295 
3296       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3297 
3298       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3299       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3300       kortestql(k7, k7);
3301       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3302       addptr(limit, 64);  // update since we already compared at this addr
3303       cmpl(limit, -64);
3304       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3305 
3306       // At this point we may still need to compare -limit+result bytes.
3307       // We could execute the next two instructions and just continue via the non-wide path:
3308       //  cmpl(limit, 0);
3309       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3310       // But since we stopped at the points ary{1,2}+limit which are
3311       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3312       // (|limit| <= 32 and result < 32),
3313       // we may just compare the last 64 bytes.
3314       //
3315       addptr(result, -64);   // this is safe because we just came from this area
3316       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3317       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3318       kortestql(k7, k7);
3319       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3320 
3321       jmp(TRUE_LABEL);
3322 
3323       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3324 
3325     }//if (VM_Version::supports_avx512vlbw())
3326 #endif //_LP64
3327     bind(COMPARE_WIDE_VECTORS);
3328     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3329     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3330     vpxor(vec1, vec2);
3331 
3332     vptest(vec1, vec1);
3333     jcc(Assembler::notZero, FALSE_LABEL);
3334     addptr(limit, 32);
3335     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3336 
3337     testl(result, result);
3338     jcc(Assembler::zero, TRUE_LABEL);
3339 
3340     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3341     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3342     vpxor(vec1, vec2);
3343 
3344     vptest(vec1, vec1);
3345     jccb(Assembler::notZero, FALSE_LABEL);
3346     jmpb(TRUE_LABEL);
3347 
3348     bind(COMPARE_TAIL); // limit is zero
3349     movl(limit, result);
3350     // Fallthru to tail compare
3351   } else if (UseSSE42Intrinsics) {
3352     // With SSE4.2, use double quad vector compare
3353     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3354 
3355     // Compare 16-byte vectors
3356     andl(result, 0x0000000f);  //   tail count (in bytes)
3357     andl(limit, 0xfffffff0);   // vector count (in bytes)
3358     jcc(Assembler::zero, COMPARE_TAIL);
3359 
3360     lea(ary1, Address(ary1, limit, Address::times_1));
3361     lea(ary2, Address(ary2, limit, Address::times_1));
3362     negptr(limit);
3363 
3364     bind(COMPARE_WIDE_VECTORS);
3365     movdqu(vec1, Address(ary1, limit, Address::times_1));
3366     movdqu(vec2, Address(ary2, limit, Address::times_1));
3367     pxor(vec1, vec2);
3368 
3369     ptest(vec1, vec1);
3370     jcc(Assembler::notZero, FALSE_LABEL);
3371     addptr(limit, 16);
3372     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3373 
3374     testl(result, result);
3375     jcc(Assembler::zero, TRUE_LABEL);
3376 
3377     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3378     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3379     pxor(vec1, vec2);
3380 
3381     ptest(vec1, vec1);
3382     jccb(Assembler::notZero, FALSE_LABEL);
3383     jmpb(TRUE_LABEL);
3384 
3385     bind(COMPARE_TAIL); // limit is zero
3386     movl(limit, result);
3387     // Fallthru to tail compare
3388   }
3389 
3390   // Compare 4-byte vectors
3391   andl(limit, 0xfffffffc); // vector count (in bytes)
3392   jccb(Assembler::zero, COMPARE_CHAR);
3393 
3394   lea(ary1, Address(ary1, limit, Address::times_1));
3395   lea(ary2, Address(ary2, limit, Address::times_1));
3396   negptr(limit);
3397 
3398   bind(COMPARE_VECTORS);
3399   movl(chr, Address(ary1, limit, Address::times_1));
3400   cmpl(chr, Address(ary2, limit, Address::times_1));
3401   jccb(Assembler::notEqual, FALSE_LABEL);
3402   addptr(limit, 4);
3403   jcc(Assembler::notZero, COMPARE_VECTORS);
3404 
3405   // Compare trailing char (final 2 bytes), if any
3406   bind(COMPARE_CHAR);
3407   testl(result, 0x2);   // tail  char
3408   jccb(Assembler::zero, COMPARE_BYTE);
3409   load_unsigned_short(chr, Address(ary1, 0));
3410   load_unsigned_short(limit, Address(ary2, 0));
3411   cmpl(chr, limit);
3412   jccb(Assembler::notEqual, FALSE_LABEL);
3413 
3414   if (is_array_equ && is_char) {
3415     bind(COMPARE_BYTE);
3416   } else {
3417     lea(ary1, Address(ary1, 2));
3418     lea(ary2, Address(ary2, 2));
3419 
3420     bind(COMPARE_BYTE);
3421     testl(result, 0x1);   // tail  byte
3422     jccb(Assembler::zero, TRUE_LABEL);
3423     load_unsigned_byte(chr, Address(ary1, 0));
3424     load_unsigned_byte(limit, Address(ary2, 0));
3425     cmpl(chr, limit);
3426     jccb(Assembler::notEqual, FALSE_LABEL);
3427   }
3428   bind(TRUE_LABEL);
3429   movl(result, 1);   // return true
3430   jmpb(DONE);
3431 
3432   bind(FALSE_LABEL);
3433   xorl(result, result); // return false
3434 
3435   // That's it
3436   bind(DONE);
3437   if (UseAVX >= 2) {
3438     // clean upper bits of YMM registers
3439     vpxor(vec1, vec1);
3440     vpxor(vec2, vec2);
3441   }
3442 }