1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
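// Map a vector length in bytes onto the AVX encoding used by the assembler.
// Vectors shorter than 16 bytes (4 or 8 bytes) are still emitted with the
// 128-bit encoding; for example, vector_length_encoding(8) and
// vector_length_encoding(16) both return Assembler::AVX_128bit, while
// vector_length_encoding(64) returns Assembler::AVX_512bit.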
  36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  37   switch (vlen_in_bytes) {
  38     case  4: // fall-through
  39     case  8: // fall-through
  40     case 16: return Assembler::AVX_128bit;
  41     case 32: return Assembler::AVX_256bit;
  42     case 64: return Assembler::AVX_512bit;
  43 
  44     default: {
  45       ShouldNotReachHere();
  46       return Assembler::AVX_NoVec;
  47     }
  48   }
  49 }
  50 
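// Build a mask with the low 'src' bits set and load it into k1: for example,
// src == 3 yields (1 << 3) - 1 == 0b111, enabling the low three vector lanes.
// Used with PostLoopMultiversioning so the vectorized post loop can execute
// its remaining iterations under a partial lane mask.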
  51 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  52   guarantee(PostLoopMultiversioning, "must be");
  53   Assembler::movl(dst, 1);
  54   Assembler::shlxl(dst, dst, src);
  55   Assembler::decl(dst);
  56   Assembler::kmovdl(k1, dst);
  57   Assembler::movl(dst, src);
  58 }
  59 
  60 void C2_MacroAssembler::restorevectmask() {
  61   guarantee(PostLoopMultiversioning, "must be");
  62   Assembler::knotwl(k1, k0);
  63 }
  64 
  65 #if INCLUDE_RTM_OPT
  66 
  67 // Update rtm_counters based on abort status
  68 // input: abort_status
  69 //        rtm_counters (RTMLockingCounters*)
  70 // flags are killed
  71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  72 
  73   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  74   if (PrintPreciseRTMLockingStatistics) {
  75     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  76       Label check_abort;
  77       testl(abort_status, (1<<i));
  78       jccb(Assembler::equal, check_abort);
  79       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  80       bind(check_abort);
  81     }
  82   }
  83 }
  84 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
  86 // tmp, scr and flags are killed
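// Illustrative arithmetic, assuming the default RTMTotalCountIncrRate of 64:
// the low TSC bits in EAX act as a pseudo-random value, and (EAX & 63) != 0 in
// 63 out of 64 cases, so callers branch past their counter increment most of
// the time and sample roughly one transaction in 64.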
  87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  88   assert(tmp == rax, "");
  89   assert(scr == rdx, "");
  90   rdtsc(); // modifies EDX:EAX
  91   andptr(tmp, count-1);
  92   jccb(Assembler::notZero, brLabel);
  93 }
  94 
  95 // Perform abort ratio calculation, set no_rtm bit if high ratio
  96 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  97 // tmpReg, rtm_counters_Reg and flags are killed
  98 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  99                                                     Register rtm_counters_Reg,
 100                                                     RTMLockingCounters* rtm_counters,
 101                                                     Metadata* method_data) {
 102   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 103 
 104   if (RTMLockingCalculationDelay > 0) {
 105     // Delay calculation
 106     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 107     testptr(tmpReg, tmpReg);
 108     jccb(Assembler::equal, L_done);
 109   }
 110   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 111   //   Aborted transactions = abort_count * 100
 112   //   All transactions = total_count *  RTMTotalCountIncrRate
 113   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
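  //
  // Worked example, assuming RTMTotalCountIncrRate == 64 and RTMAbortRatio == 50,
  // and that abort_count has already passed RTMAbortThreshold: with
  // total_count == 100 sampled transactions (~6400 real ones) and
  // abort_count == 3200, we compare 3200 * 100 == 320000 against
  // 100 * 64 * 50 == 320000, so the abort ratio is right at the threshold and
  // the no_rtm bit is set.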
 114 
 115   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 116   cmpptr(tmpReg, RTMAbortThreshold);
 117   jccb(Assembler::below, L_check_always_rtm2);
 118   imulptr(tmpReg, tmpReg, 100);
 119 
 120   Register scrReg = rtm_counters_Reg;
 121   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 122   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 123   imulptr(scrReg, scrReg, RTMAbortRatio);
 124   cmpptr(tmpReg, scrReg);
 125   jccb(Assembler::below, L_check_always_rtm1);
 126   if (method_data != NULL) {
 127     // set rtm_state to "no rtm" in MDO
 128     mov_metadata(tmpReg, method_data);
 129     lock();
 130     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 131   }
 132   jmpb(L_done);
 133   bind(L_check_always_rtm1);
 134   // Reload RTMLockingCounters* address
 135   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 136   bind(L_check_always_rtm2);
 137   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 138   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 139   jccb(Assembler::below, L_done);
 140   if (method_data != NULL) {
 141     // set rtm_state to "always rtm" in MDO
 142     mov_metadata(tmpReg, method_data);
 143     lock();
 144     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 145   }
 146   bind(L_done);
 147 }
 148 
 149 // Update counters and perform abort ratio calculation
 150 // input:  abort_status_Reg
 151 // rtm_counters_Reg, flags are killed
 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 153                                       Register rtm_counters_Reg,
 154                                       RTMLockingCounters* rtm_counters,
 155                                       Metadata* method_data,
 156                                       bool profile_rtm) {
 157 
 158   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 159   // update rtm counters based on rax value at abort
 160   // reads abort_status_Reg, updates flags
 161   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 162   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 163   if (profile_rtm) {
 164     // Save abort status because abort_status_Reg is used by following code.
 165     if (RTMRetryCount > 0) {
 166       push(abort_status_Reg);
 167     }
 168     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 169     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 170     // restore abort status
 171     if (RTMRetryCount > 0) {
 172       pop(abort_status_Reg);
 173     }
 174   }
 175 }
 176 
 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 178 // inputs: retry_count_Reg
 179 //       : abort_status_Reg
 180 // output: retry_count_Reg decremented by 1
 181 // flags are killed
 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 183   Label doneRetry;
 184   assert(abort_status_Reg == rax, "");
 185   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 186   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 187   // if reason is in 0x6 and retry count != 0 then retry
 188   andptr(abort_status_Reg, 0x6);
 189   jccb(Assembler::zero, doneRetry);
 190   testl(retry_count_Reg, retry_count_Reg);
 191   jccb(Assembler::zero, doneRetry);
 192   pause();
 193   decrementl(retry_count_Reg);
 194   jmp(retryLabel);
 195   bind(doneRetry);
 196 }
 197 
 198 // Spin and retry if lock is busy,
 199 // inputs: box_Reg (monitor address)
 200 //       : retry_count_Reg
 201 // output: retry_count_Reg decremented by 1
 202 //       : clear z flag if retry count exceeded
 203 // tmp_Reg, scr_Reg, flags are killed
 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 205                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 206   Label SpinLoop, SpinExit, doneRetry;
 207   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 208 
 209   testl(retry_count_Reg, retry_count_Reg);
 210   jccb(Assembler::zero, doneRetry);
 211   decrementl(retry_count_Reg);
 212   movptr(scr_Reg, RTMSpinLoopCount);
 213 
 214   bind(SpinLoop);
 215   pause();
 216   decrementl(scr_Reg);
 217   jccb(Assembler::lessEqual, SpinExit);
 218   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 219   testptr(tmp_Reg, tmp_Reg);
 220   jccb(Assembler::notZero, SpinLoop);
 221 
 222   bind(SpinExit);
 223   jmp(retryLabel);
 224   bind(doneRetry);
 225   incrementl(retry_count_Reg); // clear z flag
 226 }
 227 
 228 // Use RTM for normal stack locks
 229 // Input: objReg (object to lock)
 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 231                                          Register retry_on_abort_count_Reg,
 232                                          RTMLockingCounters* stack_rtm_counters,
 233                                          Metadata* method_data, bool profile_rtm,
 234                                          Label& DONE_LABEL, Label& IsInflated) {
 235   assert(UseRTMForStackLocks, "why call this otherwise?");
 236   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 237   assert(tmpReg == rax, "");
 238   assert(scrReg == rdx, "");
 239   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 240 
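  // Rough sketch of the elision protocol emitted below: xbegin starts a
  // transaction; if the mark word still looks unlocked we fall through to
  // DONE_LABEL and stay inside the transaction, so the stack lock is elided
  // entirely.  A conflicting access aborts the transaction and control resumes
  // at L_on_abort, where we profile and optionally retry.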
 241   if (RTMRetryCount > 0) {
 242     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 243     bind(L_rtm_retry);
 244   }
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 246   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 247   jcc(Assembler::notZero, IsInflated);
 248 
 249   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 250     Label L_noincrement;
 251     if (RTMTotalCountIncrRate > 1) {
 252       // tmpReg, scrReg and flags are killed
 253       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 254     }
 255     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 256     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 257     bind(L_noincrement);
 258   }
 259   xbegin(L_on_abort);
 260   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 261   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 262   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 263   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 264 
 265   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 266   if (UseRTMXendForLockBusy) {
 267     xend();
 268     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 269     jmp(L_decrement_retry);
 270   }
 271   else {
 272     xabort(0);
 273   }
 274   bind(L_on_abort);
 275   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 276     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 277   }
 278   bind(L_decrement_retry);
 279   if (RTMRetryCount > 0) {
 280     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 281     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 282   }
 283 }
 284 
 285 // Use RTM for inflating locks
 286 // inputs: objReg (object to lock)
 287 //         boxReg (on-stack box address (displaced header location) - KILLED)
 288 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 289 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 290                                             Register scrReg, Register retry_on_busy_count_Reg,
 291                                             Register retry_on_abort_count_Reg,
 292                                             RTMLockingCounters* rtm_counters,
 293                                             Metadata* method_data, bool profile_rtm,
 294                                             Label& DONE_LABEL) {
 295   assert(UseRTMLocking, "why call this otherwise?");
 296   assert(tmpReg == rax, "");
 297   assert(scrReg == rdx, "");
 298   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 299   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 300 
 301   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 302   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 303   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 304 
 305   if (RTMRetryCount > 0) {
 306     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 307     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 308     bind(L_rtm_retry);
 309   }
 310   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 311     Label L_noincrement;
 312     if (RTMTotalCountIncrRate > 1) {
 313       // tmpReg, scrReg and flags are killed
 314       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 315     }
 316     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 317     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 318     bind(L_noincrement);
 319   }
 320   xbegin(L_on_abort);
 321   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 322   movptr(tmpReg, Address(tmpReg, owner_offset));
 323   testptr(tmpReg, tmpReg);
 324   jcc(Assembler::zero, DONE_LABEL);
 325   if (UseRTMXendForLockBusy) {
 326     xend();
 327     jmp(L_decrement_retry);
 328   }
 329   else {
 330     xabort(0);
 331   }
 332   bind(L_on_abort);
 333   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 334   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 335     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 336   }
 337   if (RTMRetryCount > 0) {
 338     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 339     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 340   }
 341 
 342   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 343   testptr(tmpReg, tmpReg) ;
 344   jccb(Assembler::notZero, L_decrement_retry) ;
 345 
 346   // Appears unlocked - try to swing _owner from null to non-null.
 347   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 348 #ifdef _LP64
 349   Register threadReg = r15_thread;
 350 #else
 351   get_thread(scrReg);
 352   Register threadReg = scrReg;
 353 #endif
 354   lock();
 355   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 356 
 357   if (RTMRetryCount > 0) {
 358     // success done else retry
 359     jccb(Assembler::equal, DONE_LABEL) ;
 360     bind(L_decrement_retry);
 361     // Spin and retry if lock is busy.
 362     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 363   }
 364   else {
 365     bind(L_decrement_retry);
 366   }
 367 }
 368 
 369 #endif //  INCLUDE_RTM_OPT
 370 
 371 // fast_lock and fast_unlock used by C2
 372 
 373 // Because the transitions from emitted code to the runtime
 374 // monitorenter/exit helper stubs are so slow it's critical that
 375 // we inline both the stack-locking fast path and the inflated fast path.
 376 //
 377 // See also: cmpFastLock and cmpFastUnlock.
 378 //
 379 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might also suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 391 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
 400 //
 401 // TODO:
 402 //
 403 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 404 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 405 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 406 //    the lock operators would typically be faster than reifying Self.
 407 //
 408 // *  Ideally I'd define the primitives as:
 409 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 410 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 411 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
 414 //    sub-optimal code near the synchronization site.
 415 //
 416 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 417 //    Alternately, use a better sp-proximity test.
 418 //
 419 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 420 //    Either one is sufficient to uniquely identify a thread.
 421 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 422 //
 423 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 426 //
 427 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 428 //    But beware of excessive branch density on AMD Opterons.
 429 //
 430 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 431 //    or failure of the fast path.  If the fast path fails then we pass
 432 //    control to the slow path, typically in C.  In fast_lock and
 433 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 434 //    will emit a conditional branch immediately after the node.
 435 //    So we have branches to branches and lots of ICC.ZF games.
 436 //    Instead, it might be better to have C2 pass a "FailureLabel"
 437 //    into fast_lock and fast_unlock.  In the case of success, control
 438 //    will drop through the node.  ICC.ZF is undefined at exit.
 439 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
 441 
 442 
 443 // obj: object to lock
 444 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 446 // scr: tmp -- KILLED
 447 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 448                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 449                                  BiasedLockingCounters* counters,
 450                                  RTMLockingCounters* rtm_counters,
 451                                  RTMLockingCounters* stack_rtm_counters,
 452                                  Metadata* method_data,
 453                                  bool use_rtm, bool profile_rtm) {
 454   // Ensure the register assignments are disjoint
 455   assert(tmpReg == rax, "");
 456 
 457   if (use_rtm) {
 458     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 459   } else {
 460     assert(cx2Reg == noreg, "");
 461     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 462   }
 463 
 464   if (counters != NULL) {
 465     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 466   }
 467 
 468   // Possible cases that we'll encounter in fast_lock
 469   // ------------------------------------------------
 470   // * Inflated
 471   //    -- unlocked
 472   //    -- Locked
 473   //       = by self
 474   //       = by other
 475   // * biased
 476   //    -- by Self
 477   //    -- by other
 478   // * neutral
 479   // * stack-locked
 480   //    -- by self
 481   //       = sp-proximity test hits
 482   //       = sp-proximity test generates false-negative
 483   //    -- by other
 484   //
 485 
 486   Label IsInflated, DONE_LABEL;
 487 
 488   // it's stack-locked, biased or neutral
 489   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 490   // order to reduce the number of conditional branches in the most common cases.
 491   // Beware -- there's a subtle invariant that fetch of the markword
 492   // at [FETCH], below, will never observe a biased encoding (*101b).
 493   // If this invariant is not held we risk exclusion (safety) failure.
 494   if (UseBiasedLocking && !UseOptoBiasInlining) {
 495     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 496   }
 497 
 498 #if INCLUDE_RTM_OPT
 499   if (UseRTMForStackLocks && use_rtm) {
 500     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 501                       stack_rtm_counters, method_data, profile_rtm,
 502                       DONE_LABEL, IsInflated);
 503   }
 504 #endif // INCLUDE_RTM_OPT
 505 
 506   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 507   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 508   jccb(Assembler::notZero, IsInflated);
 509 
 510   // Attempt stack-locking ...
 511   orptr (tmpReg, markWord::unlocked_value);
 512   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 513   lock();
 514   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 515   if (counters != NULL) {
 516     cond_inc32(Assembler::equal,
 517                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 518   }
 519   jcc(Assembler::equal, DONE_LABEL);           // Success
 520 
 521   // Recursive locking.
 522   // The object is stack-locked: markword contains stack pointer to BasicLock.
 523   // Locked by current thread if difference with current SP is less than one page.
 524   subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
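  // Illustrative arithmetic, assuming 4K pages on LP64: 7 - 4096 == -4089,
  // i.e. 0xFFFFF007 as an int32_t, so the AND keeps the low lock bits plus all
  // bits at or above the page-offset field.  The result is zero (ZF == 1)
  // exactly when the BasicLock address in the mark word is at most one page
  // above the current SP.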
 526   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 527   movptr(Address(boxReg, 0), tmpReg);
 528   if (counters != NULL) {
 529     cond_inc32(Assembler::equal,
 530                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 531   }
 532   jmp(DONE_LABEL);
 533 
 534   bind(IsInflated);
 535   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 536 
 537 #if INCLUDE_RTM_OPT
 538   // Use the same RTM locking code in 32- and 64-bit VM.
 539   if (use_rtm) {
 540     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 541                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 542   } else {
 543 #endif // INCLUDE_RTM_OPT
 544 
 545 #ifndef _LP64
 546   // The object is inflated.
 547 
 548   // boxReg refers to the on-stack BasicLock in the current frame.
 549   // We'd like to write:
 550   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 552   // additional latency as we have another ST in the store buffer that must drain.
 553 
 554   // avoid ST-before-CAS
 555   // register juggle because we need tmpReg for cmpxchgptr below
 556   movptr(scrReg, boxReg);
 557   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 558 
 559   // Optimistic form: consider XORL tmpReg,tmpReg
 560   movptr(tmpReg, NULL_WORD);
 561 
 562   // Appears unlocked - try to swing _owner from null to non-null.
 563   // Ideally, I'd manifest "Self" with get_thread and then attempt
 564   // to CAS the register containing Self into m->Owner.
 565   // But we don't have enough registers, so instead we can either try to CAS
 566   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 567   // we later store "Self" into m->Owner.  Transiently storing a stack address
 568   // (rsp or the address of the box) into  m->owner is harmless.
 569   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 570   lock();
 571   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 572   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 573   // If we weren't able to swing _owner from NULL to the BasicLock
 574   // then take the slow path.
 575   jccb  (Assembler::notZero, DONE_LABEL);
 576   // update _owner from BasicLock to thread
 577   get_thread (scrReg);                    // beware: clobbers ICCs
 578   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 579   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 580 
 581   // If the CAS fails we can either retry or pass control to the slow path.
 582   // We use the latter tactic.
 583   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 584   // If the CAS was successful ...
 585   //   Self has acquired the lock
 586   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 587   // Intentional fall-through into DONE_LABEL ...
 588 #else // _LP64
 589   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 590   movq(scrReg, tmpReg);
 591   xorq(tmpReg, tmpReg);
 592   lock();
 593   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 594   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 595   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 596   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 597   // Intentional fall-through into DONE_LABEL ...
 598   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 599 #endif // _LP64
 600 #if INCLUDE_RTM_OPT
 601   } // use_rtm()
 602 #endif
 603   // DONE_LABEL is a hot target - we'd really like to place it at the
 604   // start of cache line by padding with NOPs.
 605   // See the AMD and Intel software optimization manuals for the
 606   // most efficient "long" NOP encodings.
 607   // Unfortunately none of our alignment mechanisms suffice.
 608   bind(DONE_LABEL);
 609 
 610   // At DONE_LABEL the icc ZFlag is set as follows ...
 611   // fast_unlock uses the same protocol.
 612   // ZFlag == 1 -> Success
 613   // ZFlag == 0 -> Failure - force control through the slow path
 614 }
 615 
 616 // obj: object to unlock
 617 // box: box address (displaced header location), killed.  Must be EAX.
 618 // tmp: killed, cannot be obj nor box.
 619 //
 620 // Some commentary on balanced locking:
 621 //
 622 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 623 // Methods that don't have provably balanced locking are forced to run in the
 624 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 625 // The interpreter provides two properties:
 626 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 628 //      interpreter maintains an on-stack list of locks currently held by
 629 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 632 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 634 // B() doesn't have provably balanced locking so it runs in the interpreter.
 635 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 636 // is still locked by A().
 637 //
 638 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 639 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 640 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 641 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 645 // A perfectly viable alternative is to elide the owner check except when
 646 // Xcheck:jni is enabled.
 647 
 648 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 649   assert(boxReg == rax, "");
 650   assert_different_registers(objReg, boxReg, tmpReg);
 651 
 652   Label DONE_LABEL, Stacked, CheckSucc;
 653 
 654   // Critically, the biased locking test must have precedence over
 655   // and appear before the (box->dhw == 0) recursive stack-lock test.
 656   if (UseBiasedLocking && !UseOptoBiasInlining) {
 657     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 658   }
 659 
 660 #if INCLUDE_RTM_OPT
 661   if (UseRTMForStackLocks && use_rtm) {
 662     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 663     Label L_regular_unlock;
 664     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 665     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 666     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 667     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 668     xend();                                                           // otherwise end...
 669     jmp(DONE_LABEL);                                                  // ... and we're done
 670     bind(L_regular_unlock);
 671   }
 672 #endif
 673 
 674   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 675   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 676   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 677   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 678   jccb  (Assembler::zero, Stacked);
 679 
 680   // It's inflated.
 681 #if INCLUDE_RTM_OPT
 682   if (use_rtm) {
 683     Label L_regular_inflated_unlock;
 684     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 685     movptr(boxReg, Address(tmpReg, owner_offset));
 686     testptr(boxReg, boxReg);
 687     jccb(Assembler::notZero, L_regular_inflated_unlock);
 688     xend();
 689     jmpb(DONE_LABEL);
 690     bind(L_regular_inflated_unlock);
 691   }
 692 #endif
 693 
 694   // Despite our balanced locking property we still check that m->_owner == Self
 695   // as java routines or native JNI code called by this thread might
 696   // have released the lock.
 697   // Refer to the comments in synchronizer.cpp for how we might encode extra
 698   // state in _succ so we can avoid fetching EntryList|cxq.
 699   //
 700   // I'd like to add more cases in fast_lock() and fast_unlock() --
 701   // such as recursive enter and exit -- but we have to be wary of
 702   // I$ bloat, T$ effects and BP$ effects.
 703   //
 704   // If there's no contention try a 1-0 exit.  That is, exit without
 705   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 706   // we detect and recover from the race that the 1-0 exit admits.
 707   //
 708   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 709   // before it STs null into _owner, releasing the lock.  Updates
 710   // to data protected by the critical section must be visible before
 711   // we drop the lock (and thus before any other thread could acquire
 712   // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
 714   // each other and there's no need for an explicit barrier (fence).
 715   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
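  //
  // Rough pseudo-code sketch of the inflated-exit fast path emitted below (the
  // 32-bit and 64-bit variants differ mainly in how Self is materialized):
  //   if (m->_recursions != 0)              goto slow_path;          // ZF == 0
  //   if ((m->_cxq | m->_EntryList) == 0)   { m->_owner = NULL; return; }
  //   // 64-bit only: 1-0 exit with a successor check
  //   if (m->_succ == NULL)                 goto slow_path;
  //   m->_owner = NULL;  full_fence();
  //   if (m->_succ != NULL)                 return;                  // success
  //   if (CAS(&m->_owner, NULL, Self) != NULL) return;  // another thread took over
  //   goto slow_path;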
 716 #ifndef _LP64
 717   get_thread (boxReg);
 718 
 719   // Note that we could employ various encoding schemes to reduce
 720   // the number of loads below (currently 4) to just 2 or 3.
 721   // Refer to the comments in synchronizer.cpp.
 722   // In practice the chain of fetches doesn't seem to impact performance, however.
 723   xorptr(boxReg, boxReg);
 724   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 725   jccb  (Assembler::notZero, DONE_LABEL);
 726   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 727   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 728   jccb  (Assembler::notZero, CheckSucc);
 729   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 730   jmpb  (DONE_LABEL);
 731 
 732   bind (Stacked);
 733   // It's not inflated and it's not recursively stack-locked and it's not biased.
 734   // It must be stack-locked.
 735   // Try to reset the header to displaced header.
 736   // The "box" value on the stack is stable, so we can reload
 737   // and be assured we observe the same value as above.
 738   movptr(tmpReg, Address(boxReg, 0));
 739   lock();
 740   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
 742 
 743   // DONE_LABEL is a hot target - we'd really like to place it at the
 744   // start of cache line by padding with NOPs.
 745   // See the AMD and Intel software optimization manuals for the
 746   // most efficient "long" NOP encodings.
 747   // Unfortunately none of our alignment mechanisms suffice.
 748   bind (CheckSucc);
 749 #else // _LP64
 750   // It's inflated
 751   xorptr(boxReg, boxReg);
 752   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 753   jccb  (Assembler::notZero, DONE_LABEL);
 754   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 755   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 756   jccb  (Assembler::notZero, CheckSucc);
 757   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 758   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 759   jmpb  (DONE_LABEL);
 760 
 761   // Try to avoid passing control into the slow_path ...
 762   Label LSuccess, LGoSlowPath ;
 763   bind  (CheckSucc);
 764 
 765   // The following optional optimization can be elided if necessary
 766   // Effectively: if (succ == null) goto slow path
 767   // The code reduces the window for a race, however,
 768   // and thus benefits performance.
 769   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 770   jccb  (Assembler::zero, LGoSlowPath);
 771 
 772   xorptr(boxReg, boxReg);
 773   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 774   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 775 
 776   // Memory barrier/fence
 777   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 778   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 779   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 780   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 781   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 782   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 783   lock(); addl(Address(rsp, 0), 0);
 784 
 785   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 786   jccb  (Assembler::notZero, LSuccess);
 787 
 788   // Rare inopportune interleaving - race.
 789   // The successor vanished in the small window above.
 790   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 791   // We need to ensure progress and succession.
 792   // Try to reacquire the lock.
 793   // If that fails then the new owner is responsible for succession and this
 794   // thread needs to take no further action and can exit via the fast path (success).
 795   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 799 
 800   // box is really RAX -- the following CMPXCHG depends on that binding
 801   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 802   lock();
 803   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 804   // There's no successor so we tried to regrab the lock.
 805   // If that didn't work, then another thread grabbed the
 806   // lock so we're done (and exit was a success).
 807   jccb  (Assembler::notEqual, LSuccess);
 808   // Intentional fall-through into slow path
 809 
 810   bind  (LGoSlowPath);
 811   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 812   jmpb  (DONE_LABEL);
 813 
 814   bind  (LSuccess);
 815   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 816   jmpb  (DONE_LABEL);
 817 
 818   bind  (Stacked);
 819   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 820   lock();
 821   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 822 
 823 #endif
 824   bind(DONE_LABEL);
 825 }
 826 
 827 //-------------------------------------------------------------------------------------------
 828 // Generic instructions support for use in .ad files C2 code generation
 829 
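// Absolute value and negation of packed floating-point values are pure bit
// manipulations of the sign bit: the Op_AbsV* forms AND with a mask that clears
// the sign bit (0x7FFF... per lane), and the Op_NegV* forms XOR with a mask
// that flips it (0x8000... per lane).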
 830 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 831   if (dst != src) {
 832     movdqu(dst, src);
 833   }
 834   if (opcode == Op_AbsVD) {
 835     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 836   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 838     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 839   }
 840 }
 841 
 842 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 843   if (opcode == Op_AbsVD) {
 844     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 845   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 847     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 848   }
 849 }
 850 
 851 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 852   if (dst != src) {
 853     movdqu(dst, src);
 854   }
 855   if (opcode == Op_AbsVF) {
 856     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 857   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 859     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 860   }
 861 }
 862 
 863 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 864   if (opcode == Op_AbsVF) {
 865     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 866   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 868     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 869   }
 870 }
 871 
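// The 128-bit (non-AVX) min/max has no dedicated instruction for T_LONG, so it
// is emulated with a signed compare (pcmpgtq) followed by blendvpd.  The SSE4.1
// form of blendvpd uses xmm0 as an implicit mask register, which is why callers
// must pass tmp == xmm0 for long elements.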
 872 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 873   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 874 
 875   if (opcode == Op_MinV) {
 876     if (elem_bt == T_BYTE) {
 877       pminsb(dst, src);
 878     } else if (elem_bt == T_SHORT) {
 879       pminsw(dst, src);
 880     } else if (elem_bt == T_INT) {
 881       pminsd(dst, src);
 882     } else {
 883       assert(elem_bt == T_LONG, "required");
 884       assert(tmp == xmm0, "required");
 885       movdqu(xmm0, dst);
 886       pcmpgtq(xmm0, src);
 887       blendvpd(dst, src);  // xmm0 as mask
 888     }
 889   } else { // opcode == Op_MaxV
 890     if (elem_bt == T_BYTE) {
 891       pmaxsb(dst, src);
 892     } else if (elem_bt == T_SHORT) {
 893       pmaxsw(dst, src);
 894     } else if (elem_bt == T_INT) {
 895       pmaxsd(dst, src);
 896     } else {
 897       assert(elem_bt == T_LONG, "required");
 898       assert(tmp == xmm0, "required");
 899       movdqu(xmm0, src);
 900       pcmpgtq(xmm0, dst);
 901       blendvpd(dst, src);  // xmm0 as mask
 902     }
 903   }
 904 }
 905 
 906 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
 907                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
 908                                  int vlen_enc) {
 909   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 910 
 911   if (opcode == Op_MinV) {
 912     if (elem_bt == T_BYTE) {
 913       vpminsb(dst, src1, src2, vlen_enc);
 914     } else if (elem_bt == T_SHORT) {
 915       vpminsw(dst, src1, src2, vlen_enc);
 916     } else if (elem_bt == T_INT) {
 917       vpminsd(dst, src1, src2, vlen_enc);
 918     } else {
 919       assert(elem_bt == T_LONG, "required");
 920       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 921         vpminsq(dst, src1, src2, vlen_enc);
 922       } else {
 923         vpcmpgtq(dst, src1, src2, vlen_enc);
 924         vblendvpd(dst, src1, src2, dst, vlen_enc);
 925       }
 926     }
 927   } else { // opcode == Op_MaxV
 928     if (elem_bt == T_BYTE) {
 929       vpmaxsb(dst, src1, src2, vlen_enc);
 930     } else if (elem_bt == T_SHORT) {
 931       vpmaxsw(dst, src1, src2, vlen_enc);
 932     } else if (elem_bt == T_INT) {
 933       vpmaxsd(dst, src1, src2, vlen_enc);
 934     } else {
 935       assert(elem_bt == T_LONG, "required");
 936       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
 937         vpmaxsq(dst, src1, src2, vlen_enc);
 938       } else {
 939         vpcmpgtq(dst, src1, src2, vlen_enc);
 940         vblendvpd(dst, src2, src1, dst, vlen_enc);
 941       }
 942     }
 943   }
 944 }
 945 
 946 // Float/Double min max
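// These follow Java's Math.min/max semantics rather than raw vminps/vmaxps
// behavior: -0.0 must compare smaller than +0.0, and a NaN in either input must
// produce NaN.  The initial sign-based blend orders the zeroes correctly, and
// the final unordered-compare blend forwards NaNs into the result.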
 947 
 948 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
 949                                    XMMRegister dst, XMMRegister a, XMMRegister b,
 950                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
 951                                    int vlen_enc) {
 952   assert(UseAVX > 0, "required");
 953   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 954          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 955   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 956 
 957   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 958   bool is_double_word = is_double_word_type(elem_bt);
 959 
 960   if (!is_double_word && is_min) {
 961     vblendvps(atmp, a, b, a, vlen_enc);
 962     vblendvps(btmp, b, a, a, vlen_enc);
 963     vminps(tmp, atmp, btmp, vlen_enc);
 964     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 965     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 966   } else if (!is_double_word && !is_min) {
 967     vblendvps(btmp, b, a, b, vlen_enc);
 968     vblendvps(atmp, a, b, b, vlen_enc);
 969     vmaxps(tmp, atmp, btmp, vlen_enc);
 970     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 971     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
 972   } else if (is_double_word && is_min) {
 973     vblendvpd(atmp, a, b, a, vlen_enc);
 974     vblendvpd(btmp, b, a, a, vlen_enc);
 975     vminpd(tmp, atmp, btmp, vlen_enc);
 976     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 977     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 978   } else {
 979     assert(is_double_word && !is_min, "sanity");
 980     vblendvpd(btmp, b, a, b, vlen_enc);
 981     vblendvpd(atmp, a, b, b, vlen_enc);
 982     vmaxpd(tmp, atmp, btmp, vlen_enc);
 983     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
 984     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
 985   }
 986 }
 987 
 988 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
 989                                     XMMRegister dst, XMMRegister a, XMMRegister b,
 990                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
 991                                     int vlen_enc) {
 992   assert(UseAVX > 2, "required");
 993   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
 994          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
 995   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
 996 
 997   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
 998   bool is_double_word = is_double_word_type(elem_bt);
 999   bool merge = true;
1000 
1001   if (!is_double_word && is_min) {
1002     evpmovd2m(ktmp, a, vlen_enc);
1003     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1004     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1005     vminps(dst, atmp, btmp, vlen_enc);
1006     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1008   } else if (!is_double_word && !is_min) {
1009     evpmovd2m(ktmp, b, vlen_enc);
1010     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1011     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1012     vmaxps(dst, atmp, btmp, vlen_enc);
1013     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1014     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1015   } else if (is_double_word && is_min) {
1016     evpmovq2m(ktmp, a, vlen_enc);
1017     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1018     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1019     vminpd(dst, atmp, btmp, vlen_enc);
1020     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1022   } else {
1023     assert(is_double_word && !is_min, "sanity");
1024     evpmovq2m(ktmp, b, vlen_enc);
1025     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1027     vmaxpd(dst, atmp, btmp, vlen_enc);
1028     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1030   }
1031 }
1032 
1033 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1034   if (sign) {
1035     pmovsxbw(dst, src);
1036   } else {
1037     pmovzxbw(dst, src);
1038   }
1039 }
1040 
1041 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1042   if (sign) {
1043     vpmovsxbw(dst, src, vector_len);
1044   } else {
1045     vpmovzxbw(dst, src, vector_len);
1046   }
1047 }
1048 
1049 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1050   if (sign) {
1051     vpmovsxbd(dst, src, vector_len);
1052   } else {
1053     vpmovzxbd(dst, src, vector_len);
1054   }
1055 }
1056 
1057 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1058   if (sign) {
1059     vpmovsxwd(dst, src, vector_len);
1060   } else {
1061     vpmovzxwd(dst, src, vector_len);
1062   }
1063 }
1064 
1065 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1066   switch (opcode) {
1067     case Op_RShiftVI:  psrad(dst, shift); break;
1068     case Op_LShiftVI:  pslld(dst, shift); break;
1069     case Op_URShiftVI: psrld(dst, shift); break;
1070 
1071     default: assert(false, "%s", NodeClassNames[opcode]);
1072   }
1073 }
1074 
1075 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1076   switch (opcode) {
1077     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1078     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1079     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1080 
1081     default: assert(false, "%s", NodeClassNames[opcode]);
1082   }
1083 }
1084 
1085 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1086   switch (opcode) {
1087     case Op_RShiftVB:  // fall-through
1088     case Op_RShiftVS:  psraw(dst, shift); break;
1089 
1090     case Op_LShiftVB:  // fall-through
1091     case Op_LShiftVS:  psllw(dst, shift);   break;
1092 
1093     case Op_URShiftVS: // fall-through
1094     case Op_URShiftVB: psrlw(dst, shift);  break;
1095 
1096     default: assert(false, "%s", NodeClassNames[opcode]);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1101   switch (opcode) {
1102     case Op_RShiftVB:  // fall-through
1103     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1104 
1105     case Op_LShiftVB:  // fall-through
1106     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1107 
1108     case Op_URShiftVS: // fall-through
1109     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1110 
1111     default: assert(false, "%s", NodeClassNames[opcode]);
1112   }
1113 }
1114 
1115 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1116   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1118     case Op_LShiftVL:  psllq(dst, shift); break;
1119     case Op_URShiftVL: psrlq(dst, shift); break;
1120 
1121     default: assert(false, "%s", NodeClassNames[opcode]);
1122   }
1123 }
1124 
1125 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1126   switch (opcode) {
1127     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1128     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1129     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1130 
1131     default: assert(false, "%s", NodeClassNames[opcode]);
1132   }
1133 }
1134 
1135 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1136   switch (opcode) {
1137     case Op_RShiftVB:  // fall-through
1138     case Op_RShiftVS:  // fall-through
1139     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1140 
1141     case Op_LShiftVB:  // fall-through
1142     case Op_LShiftVS:  // fall-through
1143     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1144 
1145     case Op_URShiftVB: // fall-through
1146     case Op_URShiftVS: // fall-through
1147     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1148 
1149     default: assert(false, "%s", NodeClassNames[opcode]);
1150   }
1151 }
1152 
1153 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1154   switch (opcode) {
1155     case Op_RShiftVB:  // fall-through
1156     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1157 
1158     case Op_LShiftVB:  // fall-through
1159     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1160 
1161     case Op_URShiftVB: // fall-through
1162     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1163 
1164     default: assert(false, "%s", NodeClassNames[opcode]);
1165   }
1166 }
1167 
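// Variable 64-bit shifts.  AVX2 has no variable arithmetic right shift for
// quadwords, so on pre-AVX-512 hardware Op_RShiftVL is emulated with logical
// shifts plus a sign fix-up: with m = 0x8000000000000000 >>> s per lane,
// ((x >>> s) ^ m) - m equals the arithmetic shift x >> s.  Worked example:
// x == -8, s == 1 gives (0x7FFF...FFFC ^ 0x4000...0000) - 0x4000...0000
// == 0xFFFF...FFFC == -4.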
1168 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1169   assert(UseAVX >= 2, "required");
1170   switch (opcode) {
1171     case Op_RShiftVL: {
1172       if (UseAVX > 2) {
1173         assert(tmp == xnoreg, "not used");
1174         if (!VM_Version::supports_avx512vl()) {
1175           vlen_enc = Assembler::AVX_512bit;
1176         }
1177         evpsravq(dst, src, shift, vlen_enc);
1178       } else {
1179         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1180         vpsrlvq(dst, src, shift, vlen_enc);
1181         vpsrlvq(tmp, tmp, shift, vlen_enc);
1182         vpxor(dst, dst, tmp, vlen_enc);
1183         vpsubq(dst, dst, tmp, vlen_enc);
1184       }
1185       break;
1186     }
1187     case Op_LShiftVL: {
1188       assert(tmp == xnoreg, "not used");
1189       vpsllvq(dst, src, shift, vlen_enc);
1190       break;
1191     }
1192     case Op_URShiftVL: {
1193       assert(tmp == xnoreg, "not used");
1194       vpsrlvq(dst, src, shift, vlen_enc);
1195       break;
1196     }
1197     default: assert(false, "%s", NodeClassNames[opcode]);
1198   }
1199 }
1200 
1201 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
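// There is no byte-granular variable shift instruction, so the bytes are first
// widened to dwords (vextendbd), shifted with the dword variable-shift forms,
// masked back into byte range and re-packed; evarshiftb below applies the same
// widen/shift/narrow idea through words on AVX-512.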
1202 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1203   assert(opcode == Op_LShiftVB ||
1204          opcode == Op_RShiftVB ||
1205          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1206   bool sign = (opcode != Op_URShiftVB);
1207   assert(vector_len == 0, "required");
1208   vextendbd(sign, dst, src, 1);
1209   vpmovzxbd(vtmp, shift, 1);
1210   varshiftd(opcode, dst, dst, vtmp, 1);
1211   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1212   vextracti128_high(vtmp, dst);
1213   vpackusdw(dst, dst, vtmp, 0);
1214 }
1215 
1216 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1217 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1218   assert(opcode == Op_LShiftVB ||
1219          opcode == Op_RShiftVB ||
1220          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1221   bool sign = (opcode != Op_URShiftVB);
1222   int ext_vector_len = vector_len + 1;
1223   vextendbw(sign, dst, src, ext_vector_len);
1224   vpmovzxbw(vtmp, shift, ext_vector_len);
1225   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1226   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1227   if (vector_len == 0) {
1228     vextracti128_high(vtmp, dst);
1229     vpackuswb(dst, dst, vtmp, vector_len);
1230   } else {
1231     vextracti64x4_high(vtmp, dst);
1232     vpackuswb(dst, dst, vtmp, vector_len);
1233     vpermq(dst, dst, 0xD8, vector_len);
1234   }
1235 }
1236 
1237 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1238   switch(typ) {
1239     case T_BYTE:
1240       pinsrb(dst, val, idx);
1241       break;
1242     case T_SHORT:
1243       pinsrw(dst, val, idx);
1244       break;
1245     case T_INT:
1246       pinsrd(dst, val, idx);
1247       break;
1248     case T_LONG:
1249       pinsrq(dst, val, idx);
1250       break;
1251     default:
1252       assert(false,"Should not reach here.");
1253       break;
1254   }
1255 }
1256 
1257 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1258   switch(typ) {
1259     case T_BYTE:
1260       vpinsrb(dst, src, val, idx);
1261       break;
1262     case T_SHORT:
1263       vpinsrw(dst, src, val, idx);
1264       break;
1265     case T_INT:
1266       vpinsrd(dst, src, val, idx);
1267       break;
1268     case T_LONG:
1269       vpinsrq(dst, src, val, idx);
1270       break;
1271     default:
1272       assert(false,"Should not reach here.");
1273       break;
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1278   switch(typ) {
1279     case T_INT:
1280       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1281       break;
1282     case T_FLOAT:
1283       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1284       break;
1285     case T_LONG:
1286       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1287       break;
1288     case T_DOUBLE:
1289       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1290       break;
1291     default:
1292       assert(false,"Should not reach here.");
1293       break;
1294   }
1295 }
1296 
1297 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1298   switch(typ) {
1299     case T_INT:
1300       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1301       break;
1302     case T_FLOAT:
1303       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1304       break;
1305     case T_LONG:
1306       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1307       break;
1308     case T_DOUBLE:
1309       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1310       break;
1311     default:
1312       assert(false,"Should not reach here.");
1313       break;
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1318   switch(typ) {
1319     case T_INT:
1320       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1321       break;
1322     case T_FLOAT:
1323       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1324       break;
1325     case T_LONG:
1326       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1327       break;
1328     case T_DOUBLE:
1329       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1330       break;
1331     default:
1332       assert(false,"Should not reach here.");
1333       break;
1334   }
1335 }
1336 
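     // Expand a boolean vector (expected to hold 0 or 1 per byte) into a mask of the
     // requested element width: 0 - x turns each byte into 0x00 or 0xFF, which is then
     // sign-extended to the element size given by elem_bt.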
1337 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
1338   if (vlen_in_bytes <= 16) {
1339     pxor (dst, dst);
1340     psubb(dst, src);
1341     switch (elem_bt) {
1342       case T_BYTE:   /* nothing to do */ break;
1343       case T_SHORT:  pmovsxbw(dst, dst); break;
1344       case T_INT:    pmovsxbd(dst, dst); break;
1345       case T_FLOAT:  pmovsxbd(dst, dst); break;
1346       case T_LONG:   pmovsxbq(dst, dst); break;
1347       case T_DOUBLE: pmovsxbq(dst, dst); break;
1348 
1349       default: assert(false, "%s", type2name(elem_bt));
1350     }
1351   } else {
1352     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1353 
1354     vpxor (dst, dst, dst, vlen_enc);
1355     vpsubb(dst, dst, src, vlen_enc);
1356     switch (elem_bt) {
1357       case T_BYTE:   /* nothing to do */            break;
1358       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1359       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1360       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1361       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1362       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1363 
1364       default: assert(false, "%s", type2name(elem_bt));
1365     }
1366   }
1367 }
1368 
1369 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1370   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1371   if (vlen_in_bytes <= 16) {
1372     movdqu(dst, addr, scratch);
1373   } else if (vlen_in_bytes == 32) {
1374     vmovdqu(dst, addr, scratch);
1375   } else {
1376     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1377     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1378   }
1379 }
1380 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1381 
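     // reduce_operation_128/256 perform one folding step of a reduction: they combine
     // dst (or src1) with src element-wise using the operation selected by opcode and
     // leave the partial result in dst. Scalar extraction is done by the reduce*
     // routines further below.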
1382 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1383   int vector_len = Assembler::AVX_128bit;
1384 
1385   switch (opcode) {
1386     case Op_AndReductionV:  pand(dst, src); break;
1387     case Op_OrReductionV:   por (dst, src); break;
1388     case Op_XorReductionV:  pxor(dst, src); break;
1389     case Op_MinReductionV:
1390       switch (typ) {
1391         case T_BYTE:        pminsb(dst, src); break;
1392         case T_SHORT:       pminsw(dst, src); break;
1393         case T_INT:         pminsd(dst, src); break;
1394         case T_LONG:        assert(UseAVX > 2, "required");
1395                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1396         default:            assert(false, "wrong type");
1397       }
1398       break;
1399     case Op_MaxReductionV:
1400       switch (typ) {
1401         case T_BYTE:        pmaxsb(dst, src); break;
1402         case T_SHORT:       pmaxsw(dst, src); break;
1403         case T_INT:         pmaxsd(dst, src); break;
1404         case T_LONG:        assert(UseAVX > 2, "required");
1405                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1406         default:            assert(false, "wrong type");
1407       }
1408       break;
1409     case Op_AddReductionVF: addss(dst, src); break;
1410     case Op_AddReductionVD: addsd(dst, src); break;
1411     case Op_AddReductionVI:
1412       switch (typ) {
1413         case T_BYTE:        paddb(dst, src); break;
1414         case T_SHORT:       paddw(dst, src); break;
1415         case T_INT:         paddd(dst, src); break;
1416         default:            assert(false, "wrong type");
1417       }
1418       break;
1419     case Op_AddReductionVL: paddq(dst, src); break;
1420     case Op_MulReductionVF: mulss(dst, src); break;
1421     case Op_MulReductionVD: mulsd(dst, src); break;
1422     case Op_MulReductionVI:
1423       switch (typ) {
1424         case T_SHORT:       pmullw(dst, src); break;
1425         case T_INT:         pmulld(dst, src); break;
1426         default:            assert(false, "wrong type");
1427       }
1428       break;
1429     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1430                             vpmullq(dst, dst, src, vector_len); break;
1431     default:                assert(false, "wrong opcode");
1432   }
1433 }
1434 
1435 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1436   int vector_len = Assembler::AVX_256bit;
1437 
1438   switch (opcode) {
1439     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1440     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1441     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1442     case Op_MinReductionV:
1443       switch (typ) {
1444         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1445         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1446         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1447         case T_LONG:        assert(UseAVX > 2, "required");
1448                             vpminsq(dst, src1, src2, vector_len); break;
1449         default:            assert(false, "wrong type");
1450       }
1451       break;
1452     case Op_MaxReductionV:
1453       switch (typ) {
1454         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1455         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1456         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1457         case T_LONG:        assert(UseAVX > 2, "required");
1458                             vpmaxsq(dst, src1, src2, vector_len); break;
1459         default:            assert(false, "wrong type");
1460       }
1461       break;
1462     case Op_AddReductionVI:
1463       switch (typ) {
1464         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1465         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1466         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1467         default:            assert(false, "wrong type");
1468       }
1469       break;
1470     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1471     case Op_MulReductionVI:
1472       switch (typ) {
1473         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1474         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1475         default:            assert(false, "wrong type");
1476       }
1477       break;
1478     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1479     default:                assert(false, "wrong opcode");
1480   }
1481 }
1482 
1483 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1484                                   XMMRegister dst, XMMRegister src,
1485                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1486   switch (opcode) {
1487     case Op_AddReductionVF:
1488     case Op_MulReductionVF:
1489       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1490       break;
1491 
1492     case Op_AddReductionVD:
1493     case Op_MulReductionVD:
1494       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1495       break;
1496 
1497     default: assert(false, "wrong opcode");
1498   }
1499 }
1500 
1501 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1502                              Register dst, Register src1, XMMRegister src2,
1503                              XMMRegister vtmp1, XMMRegister vtmp2) {
1504   switch (vlen) {
1505     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1506     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1507     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1508     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1509 
1510     default: assert(false, "wrong vector length");
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1515                              Register dst, Register src1, XMMRegister src2,
1516                              XMMRegister vtmp1, XMMRegister vtmp2) {
1517   switch (vlen) {
1518     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1519     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1520     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1521     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1522 
1523     default: assert(false, "wrong vector length");
1524   }
1525 }
1526 
1527 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1528                              Register dst, Register src1, XMMRegister src2,
1529                              XMMRegister vtmp1, XMMRegister vtmp2) {
1530   switch (vlen) {
1531     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1532     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1533     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1534     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1535 
1536     default: assert(false, "wrong vector length");
1537   }
1538 }
1539 
1540 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1541                              Register dst, Register src1, XMMRegister src2,
1542                              XMMRegister vtmp1, XMMRegister vtmp2) {
1543   switch (vlen) {
1544     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1545     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1546     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1547     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1548 
1549     default: assert(false, "wrong vector length");
1550   }
1551 }
1552 
1553 #ifdef _LP64
1554 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1555                              Register dst, Register src1, XMMRegister src2,
1556                              XMMRegister vtmp1, XMMRegister vtmp2) {
1557   switch (vlen) {
1558     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1559     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1560     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1561 
1562     default: assert(false, "wrong vector length");
1563   }
1564 }
1565 #endif // _LP64
1566 
1567 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1568   switch (vlen) {
1569     case 2:
1570       assert(vtmp2 == xnoreg, "");
1571       reduce2F(opcode, dst, src, vtmp1);
1572       break;
1573     case 4:
1574       assert(vtmp2 == xnoreg, "");
1575       reduce4F(opcode, dst, src, vtmp1);
1576       break;
1577     case 8:
1578       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1579       break;
1580     case 16:
1581       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1582       break;
1583     default: assert(false, "wrong vector length");
1584   }
1585 }
1586 
1587 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1588   switch (vlen) {
1589     case 2:
1590       assert(vtmp2 == xnoreg, "");
1591       reduce2D(opcode, dst, src, vtmp1);
1592       break;
1593     case 4:
1594       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1595       break;
1596     case 8:
1597       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1598       break;
1599     default: assert(false, "wrong vector length");
1600   }
1601 }
1602 
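     // The integral reduce* routines below all follow the same pattern: repeatedly fold
     // the upper half of the vector onto the lower half via reduce_operation_128/256,
     // combine with the scalar input src1, and move the final lane into the
     // general-purpose register dst. The floating-point variants (reduce2F etc.)
     // accumulate directly into the XMM register dst instead.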
1603 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1604   if (opcode == Op_AddReductionVI) {
1605     if (vtmp1 != src2) {
1606       movdqu(vtmp1, src2);
1607     }
1608     phaddd(vtmp1, vtmp1);
1609   } else {
1610     pshufd(vtmp1, src2, 0x1);
1611     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1612   }
1613   movdl(vtmp2, src1);
1614   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1615   movdl(dst, vtmp1);
1616 }
1617 
1618 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1619   if (opcode == Op_AddReductionVI) {
1620     if (vtmp1 != src2) {
1621       movdqu(vtmp1, src2);
1622     }
1623     phaddd(vtmp1, src2);
1624     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1625   } else {
1626     pshufd(vtmp2, src2, 0xE);
1627     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1628     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1629   }
1630 }
1631 
1632 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1633   if (opcode == Op_AddReductionVI) {
1634     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1635     vextracti128_high(vtmp2, vtmp1);
1636     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1637     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1638   } else {
1639     vextracti128_high(vtmp1, src2);
1640     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1641     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1642   }
1643 }
1644 
1645 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1646   vextracti64x4_high(vtmp2, src2);
1647   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1648   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1649 }
1650 
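     // Byte reductions: the 8-byte group is folded in halves with shuffles and byte
     // shifts, the partial result is widened to an int and combined with src1, and the
     // final byte is sign-extended into dst.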
1651 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1652   pshufd(vtmp2, src2, 0x1);
1653   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1654   movdqu(vtmp1, vtmp2);
1655   psrldq(vtmp1, 2);
1656   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1657   movdqu(vtmp2, vtmp1);
1658   psrldq(vtmp2, 1);
1659   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1660   movdl(vtmp2, src1);
1661   pmovsxbd(vtmp1, vtmp1);
1662   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1663   pextrb(dst, vtmp1, 0x0);
1664   movsbl(dst, dst);
1665 }
1666 
1667 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1668   pshufd(vtmp1, src2, 0xE);
1669   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1670   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1671 }
1672 
1673 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1674   vextracti128_high(vtmp2, src2);
1675   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1676   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1677 }
1678 
1679 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1680   vextracti64x4_high(vtmp1, src2);
1681   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1682   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1683 }
1684 
1685 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1686   pmovsxbw(vtmp2, src2);
1687   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1688 }
1689 
1690 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1691   if (UseAVX > 1) {
1692     int vector_len = Assembler::AVX_256bit;
1693     vpmovsxbw(vtmp1, src2, vector_len);
1694     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1695   } else {
1696     pmovsxbw(vtmp2, src2);
1697     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1698     pshufd(vtmp2, src2, 0xEE);  // bring bytes 8..15 of src2 into the low half
1699     pmovsxbw(vtmp2, vtmp2);     // sign-extend them to words for the second pass
1700     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1701   }
1702 }
1703 
1704 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1705   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1706     int vector_len = Assembler::AVX_512bit;
1707     vpmovsxbw(vtmp1, src2, vector_len);
1708     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1709   } else {
1710     assert(UseAVX >= 2,"Should not reach here.");
1711     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1712     vextracti128_high(vtmp2, src2);
1713     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1714   }
1715 }
1716 
1717 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1718   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1719   vextracti64x4_high(vtmp2, src2);
1720   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1721 }
1722 
1723 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1724   if (opcode == Op_AddReductionVI) {
1725     if (vtmp1 != src2) {
1726       movdqu(vtmp1, src2);
1727     }
1728     phaddw(vtmp1, vtmp1);
1729     phaddw(vtmp1, vtmp1);
1730   } else {
1731     pshufd(vtmp2, src2, 0x1);
1732     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1733     movdqu(vtmp1, vtmp2);
1734     psrldq(vtmp1, 2);
1735     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1736   }
1737   movdl(vtmp2, src1);
1738   pmovsxwd(vtmp1, vtmp1);
1739   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1740   pextrw(dst, vtmp1, 0x0);
1741   movswl(dst, dst);
1742 }
1743 
1744 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1745   if (opcode == Op_AddReductionVI) {
1746     if (vtmp1 != src2) {
1747       movdqu(vtmp1, src2);
1748     }
1749     phaddw(vtmp1, src2);
1750   } else {
1751     pshufd(vtmp1, src2, 0xE);
1752     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1753   }
1754   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1755 }
1756 
1757 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1758   if (opcode == Op_AddReductionVI) {
1759     int vector_len = Assembler::AVX_256bit;
1760     vphaddw(vtmp2, src2, src2, vector_len);
1761     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1762   } else {
1763     vextracti128_high(vtmp2, src2);
1764     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1765   }
1766   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1767 }
1768 
1769 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1770   int vector_len = Assembler::AVX_256bit;
1771   vextracti64x4_high(vtmp1, src2);
1772   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1773   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1774 }
1775 
1776 #ifdef _LP64
1777 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1778   pshufd(vtmp2, src2, 0xE);
1779   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1780   movdq(vtmp1, src1);
1781   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1782   movdq(dst, vtmp1);
1783 }
1784 
1785 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1786   vextracti128_high(vtmp1, src2);
1787   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1788   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1789 }
1790 
1791 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1792   vextracti64x4_high(vtmp2, src2);
1793   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1794   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1795 }
1796 #endif // _LP64
1797 
1798 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1799   reduce_operation_128(T_FLOAT, opcode, dst, src);
1800   pshufd(vtmp, src, 0x1);
1801   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1802 }
1803 
1804 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1805   reduce2F(opcode, dst, src, vtmp);
1806   pshufd(vtmp, src, 0x2);
1807   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1808   pshufd(vtmp, src, 0x3);
1809   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1810 }
1811 
1812 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1813   reduce4F(opcode, dst, src, vtmp2);
1814   vextractf128_high(vtmp2, src);
1815   reduce4F(opcode, dst, vtmp2, vtmp1);
1816 }
1817 
1818 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1819   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1820   vextracti64x4_high(vtmp1, src);
1821   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1822 }
1823 
1824 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1825   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1826   pshufd(vtmp, src, 0xE);
1827   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1828 }
1829 
1830 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1831   reduce2D(opcode, dst, src, vtmp2);
1832   vextractf128_high(vtmp2, src);
1833   reduce2D(opcode, dst, vtmp2, vtmp1);
1834 }
1835 
1836 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1837   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1838   vextracti64x4_high(vtmp1, src);
1839   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1840 }
1841 
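     // Min/max reductions for floats and doubles go through vminmax_fp at every step so
     // that Java's NaN and -0.0 semantics are preserved. The loop below halves the
     // active width log2(vlen) times (256/128-bit extracts first, then in-lane
     // permutes); when is_dst_valid, the incoming dst is folded into the final result.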
1842 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1843                                           XMMRegister dst, XMMRegister src,
1844                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1845                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1846   int permconst[] = {1, 14};
1847   XMMRegister wsrc = src;
1848   XMMRegister wdst = xmm_0;
1849   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1850 
1851   int vlen_enc = Assembler::AVX_128bit;
1852   if (vlen == 16) {
1853     vlen_enc = Assembler::AVX_256bit;
1854   }
1855 
1856   for (int i = log2(vlen) - 1; i >=0; i--) {
1857     if (i == 0 && !is_dst_valid) {
1858       wdst = dst;
1859     }
1860     if (i == 3) {
1861       vextracti64x4_high(wtmp, wsrc);
1862     } else if (i == 2) {
1863       vextracti128_high(wtmp, wsrc);
1864     } else { // i = [0,1]
1865       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1866     }
1867     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1868     wsrc = wdst;
1869     vlen_enc = Assembler::AVX_128bit;
1870   }
1871   if (is_dst_valid) {
1872     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1873   }
1874 }
1875 
1876 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1877                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1878                                         XMMRegister xmm_0, XMMRegister xmm_1) {
1879   XMMRegister wsrc = src;
1880   XMMRegister wdst = xmm_0;
1881   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1882   int vlen_enc = Assembler::AVX_128bit;
1883   if (vlen == 8) {
1884     vlen_enc = Assembler::AVX_256bit;
1885   }
1886   for (int i = log2(vlen) - 1; i >=0; i--) {
1887     if (i == 0 && !is_dst_valid) {
1888       wdst = dst;
1889     }
1890     if (i == 1) {
1891       vextracti128_high(wtmp, wsrc);
1892     } else if (i == 2) {
1893       vextracti64x4_high(wtmp, wsrc);
1894     } else {
1895       assert(i == 0, "%d", i);
1896       vpermilpd(wtmp, wsrc, 1, vlen_enc);
1897     }
1898     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1899     wsrc = wdst;
1900     vlen_enc = Assembler::AVX_128bit;
1901   }
1902   if (is_dst_valid) {
1903     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1904   }
1905 }
1906 
1907 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
1908   switch (bt) {
1909     case T_BYTE:  pextrb(dst, src, idx); break;
1910     case T_SHORT: pextrw(dst, src, idx); break;
1911     case T_INT:   pextrd(dst, src, idx); break;
1912     case T_LONG:  pextrq(dst, src, idx); break;
1913 
1914     default:
1915       assert(false,"Should not reach here.");
1916       break;
1917   }
1918 }
1919 
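     // Returns the XMM register holding the 128-bit lane that contains element
     // 'elemindex': lane 0 is src itself, lane 1 is extracted into dst with
     // vextractf128, and lanes 2-3 use vextractf32x4 (AVX-512 only).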
1920 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
1921   int esize =  type2aelembytes(typ);
1922   int elem_per_lane = 16/esize;
1923   int lane = elemindex / elem_per_lane;
1924   int eindex = elemindex % elem_per_lane;
1925 
1926   if (lane >= 2) {
1927     assert(UseAVX > 2, "required");
1928     vextractf32x4(dst, src, lane & 3);
1929     return dst;
1930   } else if (lane > 0) {
1931     assert(UseAVX > 0, "required");
1932     vextractf128(dst, src, lane);
1933     return dst;
1934   } else {
1935     return src;
1936   }
1937 }
1938 
1939 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
1940   int esize =  type2aelembytes(typ);
1941   int elem_per_lane = 16/esize;
1942   int eindex = elemindex % elem_per_lane;
1943   assert(is_integral_type(typ),"required");
1944 
1945   if (eindex == 0) {
1946     if (typ == T_LONG) {
1947       movq(dst, src);
1948     } else {
1949       movdl(dst, src);
1950       if (typ == T_BYTE)
1951         movsbl(dst, dst);
1952       else if (typ == T_SHORT)
1953         movswl(dst, dst);
1954     }
1955   } else {
1956     extract(typ, dst, src, eindex);
1957   }
1958 }
1959 
1960 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
1961   int esize =  type2aelembytes(typ);
1962   int elem_per_lane = 16/esize;
1963   int eindex = elemindex % elem_per_lane;
1964   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
1965 
1966   if (eindex == 0) {
1967     movq(dst, src);
1968   } else {
1969     if (typ == T_FLOAT) {
1970       if (UseAVX == 0) {
1971         movdqu(dst, src);
1972         pshufps(dst, dst, eindex);
1973       } else {
1974         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
1975       }
1976     } else {
1977       if (UseAVX == 0) {
1978         movdqu(dst, src);
1979         psrldq(dst, eindex*esize);
1980       } else {
1981         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
1982       }
1983       movq(dst, dst);
1984     }
1985   }
1986   // Zero upper bits
1987   if (typ == T_FLOAT) {
1988     if (UseAVX == 0) {
1989       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
1990       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
1991       pand(dst, vtmp);
1992     } else {
1993       assert((tmp != noreg), "required.");
1994       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
1995     }
1996   }
1997 }
1998 
1999 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2000   switch(typ) {
2001     case T_BYTE:
2002       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2003       break;
2004     case T_SHORT:
2005       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2006       break;
2007     case T_INT:
2008     case T_FLOAT:
2009       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2010       break;
2011     case T_LONG:
2012     case T_DOUBLE:
2013       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2014       break;
2015     default:
2016       assert(false,"Should not reach here.");
2017       break;
2018   }
2019 }
2020 
2021 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2022   switch(typ) {
2023     case T_BYTE:
2024       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2025       break;
2026     case T_SHORT:
2027       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2028       break;
2029     case T_INT:
2030     case T_FLOAT:
2031       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2032       break;
2033     case T_LONG:
2034     case T_DOUBLE:
2035       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2036       break;
2037     default:
2038       assert(false,"Should not reach here.");
2039       break;
2040   }
2041 }
2042 
2043 //-------------------------------------------------------------------------------------------
2044 
2045 // IndexOf for constant substrings with size >= 8 chars
2046 // which don't need to be loaded through the stack.
2047 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2048                                          Register cnt1, Register cnt2,
2049                                          int int_cnt2,  Register result,
2050                                          XMMRegister vec, Register tmp,
2051                                          int ae) {
2052   ShortBranchVerifier sbv(this);
2053   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2054   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2055 
2056   // This method uses the pcmpestri instruction with bound registers
2057   //   inputs:
2058   //     xmm - substring
2059   //     rax - substring length (elements count)
2060   //     mem - scanned string
2061   //     rdx - string length (elements count)
2062   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2063   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2064   //   outputs:
2065   //     rcx - matched index in string
2066   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2067   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2068   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2069   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2070   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2071 
2072   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2073         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2074         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2075 
2076   // Note, inline_string_indexOf() generates checks:
2077   // if (substr.count > string.count) return -1;
2078   // if (substr.count == 0) return 0;
2079   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2080 
2081   // Load substring.
2082   if (ae == StrIntrinsicNode::UL) {
2083     pmovzxbw(vec, Address(str2, 0));
2084   } else {
2085     movdqu(vec, Address(str2, 0));
2086   }
2087   movl(cnt2, int_cnt2);
2088   movptr(result, str1); // string addr
2089 
2090   if (int_cnt2 > stride) {
2091     jmpb(SCAN_TO_SUBSTR);
2092 
2093     // Reload substr for rescan; this code
2094     // is executed only for large substrings (> 8 chars).
2095     bind(RELOAD_SUBSTR);
2096     if (ae == StrIntrinsicNode::UL) {
2097       pmovzxbw(vec, Address(str2, 0));
2098     } else {
2099       movdqu(vec, Address(str2, 0));
2100     }
2101     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2102 
2103     bind(RELOAD_STR);
2104     // We came here after the beginning of the substring was
2105     // matched but the rest of it was not, so we need to search
2106     // again. Start from the next element after the previous match.
2107 
2108     // cnt2 is the number of remaining substring elements and
2109     // cnt1 is the number of remaining string elements when the compare failed.
2110     // Restore cnt1 = cnt1 - cnt2 + int_cnt2
2111     subl(cnt1, cnt2);
2112     addl(cnt1, int_cnt2);
2113     movl(cnt2, int_cnt2); // Now restore cnt2
2114 
2115     decrementl(cnt1);     // Shift to next element
2116     cmpl(cnt1, cnt2);
2117     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2118 
2119     addptr(result, (1<<scale1));
2120 
2121   } // (int_cnt2 > 8)
2122 
2123   // Scan string for start of substr in 16-byte vectors
2124   bind(SCAN_TO_SUBSTR);
2125   pcmpestri(vec, Address(result, 0), mode);
2126   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2127   subl(cnt1, stride);
2128   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2129   cmpl(cnt1, cnt2);
2130   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2131   addptr(result, 16);
2132   jmpb(SCAN_TO_SUBSTR);
2133 
2134   // Found a potential substr
2135   bind(FOUND_CANDIDATE);
2136   // Matched whole vector if first element matched (tmp(rcx) == 0).
2137   if (int_cnt2 == stride) {
2138     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2139   } else { // int_cnt2 > 8
2140     jccb(Assembler::overflow, FOUND_SUBSTR);
2141   }
2142   // After pcmpestri tmp(rcx) contains matched element index
2143   // Compute start addr of substr
2144   lea(result, Address(result, tmp, scale1));
2145 
2146   // Make sure string is still long enough
2147   subl(cnt1, tmp);
2148   cmpl(cnt1, cnt2);
2149   if (int_cnt2 == stride) {
2150     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2151   } else { // int_cnt2 > 8
2152     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2153   }
2154   // Fewer elements left than the substring.
2155 
2156   bind(RET_NOT_FOUND);
2157   movl(result, -1);
2158   jmp(EXIT);
2159 
2160   if (int_cnt2 > stride) {
2161     // This code is optimized for the case when whole substring
2162     // is matched if its head is matched.
2163     bind(MATCH_SUBSTR_HEAD);
2164     pcmpestri(vec, Address(result, 0), mode);
2165     // Reload only the string if it does not match
2166     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2167 
2168     Label CONT_SCAN_SUBSTR;
2169     // Compare the rest of substring (> 8 chars).
2170     bind(FOUND_SUBSTR);
2171     // First 8 chars are already matched.
2172     negptr(cnt2);
2173     addptr(cnt2, stride);
2174 
2175     bind(SCAN_SUBSTR);
2176     subl(cnt1, stride);
2177     cmpl(cnt2, -stride); // Do not read beyond substring
2178     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2179     // Back-up strings to avoid reading beyond substring:
2180     // cnt1 = cnt1 - cnt2 + 8
2181     addl(cnt1, cnt2); // cnt2 is negative
2182     addl(cnt1, stride);
2183     movl(cnt2, stride); negptr(cnt2);
2184     bind(CONT_SCAN_SUBSTR);
2185     if (int_cnt2 < (int)G) {
2186       int tail_off1 = int_cnt2<<scale1;
2187       int tail_off2 = int_cnt2<<scale2;
2188       if (ae == StrIntrinsicNode::UL) {
2189         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2190       } else {
2191         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2192       }
2193       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2194     } else {
2195       // calculate index in register to avoid integer overflow (int_cnt2*2)
2196       movl(tmp, int_cnt2);
2197       addptr(tmp, cnt2);
2198       if (ae == StrIntrinsicNode::UL) {
2199         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2200       } else {
2201         movdqu(vec, Address(str2, tmp, scale2, 0));
2202       }
2203       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2204     }
2205     // Need to reload string pointers if we did not match the whole vector
2206     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2207     addptr(cnt2, stride);
2208     jcc(Assembler::negative, SCAN_SUBSTR);
2209     // Fall through if found full substring
2210 
2211   } // (int_cnt2 > 8)
2212 
2213   bind(RET_FOUND);
2214   // Found result if we matched full small substring.
2215   // Compute substr offset
2216   subptr(result, str1);
2217   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2218     shrl(result, 1); // index
2219   }
2220   bind(EXIT);
2221 
2222 } // string_indexofC8
2223 
2224 // Small strings are loaded through the stack if they cross a page boundary.
2225 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2226                                        Register cnt1, Register cnt2,
2227                                        int int_cnt2,  Register result,
2228                                        XMMRegister vec, Register tmp,
2229                                        int ae) {
2230   ShortBranchVerifier sbv(this);
2231   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2232   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2233 
2234   //
2235   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2236   // or (-1) for a non-constant substring, in which case its length
2237   // is in the cnt2 register.
2238   //
2239   // Note, inline_string_indexOf() generates checks:
2240   // if (substr.count > string.count) return -1;
2241   // if (substr.count == 0) return 0;
2242   //
2243   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2244   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2245   // This method uses the pcmpestri instruction with bound registers
2246   //   inputs:
2247   //     xmm - substring
2248   //     rax - substring length (elements count)
2249   //     mem - scanned string
2250   //     rdx - string length (elements count)
2251   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2252   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2253   //   outputs:
2254   //     rcx - matched index in string
2255   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2256   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2257   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2258   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2259 
2260   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2261         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2262         FOUND_CANDIDATE;
2263 
2264   { //========================================================
2265     // We don't know where these strings are located
2266     // and we can't read beyond them. Load them through the stack.
2267     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2268 
2269     movptr(tmp, rsp); // save old SP
2270 
2271     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2272       if (int_cnt2 == (1>>scale2)) { // One byte
2273         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2274         load_unsigned_byte(result, Address(str2, 0));
2275         movdl(vec, result); // move 32 bits
2276       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2277         // Not enough header space in 32-bit VM: 12+3 = 15.
2278         movl(result, Address(str2, -1));
2279         shrl(result, 8);
2280         movdl(vec, result); // move 32 bits
2281       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2282         load_unsigned_short(result, Address(str2, 0));
2283         movdl(vec, result); // move 32 bits
2284       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2285         movdl(vec, Address(str2, 0)); // move 32 bits
2286       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2287         movq(vec, Address(str2, 0));  // move 64 bits
2288       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2289         // Array header size is 12 bytes in 32-bit VM
2290         // + 6 bytes for 3 chars == 18 bytes,
2291         // enough space to load vec and shift.
2292         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2293         if (ae == StrIntrinsicNode::UL) {
2294           int tail_off = int_cnt2-8;
2295           pmovzxbw(vec, Address(str2, tail_off));
2296           psrldq(vec, -2*tail_off);
2297         }
2298         else {
2299           int tail_off = int_cnt2*(1<<scale2);
2300           movdqu(vec, Address(str2, tail_off-16));
2301           psrldq(vec, 16-tail_off);
2302         }
2303       }
2304     } else { // not constant substring
2305       cmpl(cnt2, stride);
2306       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2307 
2308       // We can read beyond the string if str+16 does not cross a page boundary
2309       // since heaps are aligned and mapped by pages.
2310       assert(os::vm_page_size() < (int)G, "default page should be small");
2311       movl(result, str2); // We need only low 32 bits
2312       andl(result, (os::vm_page_size()-1));
2313       cmpl(result, (os::vm_page_size()-16));
2314       jccb(Assembler::belowEqual, CHECK_STR);
2315 
2316       // Move small strings to stack to allow load 16 bytes into vec.
2317       subptr(rsp, 16);
2318       int stk_offset = wordSize-(1<<scale2);
2319       push(cnt2);
2320 
2321       bind(COPY_SUBSTR);
2322       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2323         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2324         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2325       } else if (ae == StrIntrinsicNode::UU) {
2326         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2327         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2328       }
2329       decrement(cnt2);
2330       jccb(Assembler::notZero, COPY_SUBSTR);
2331 
2332       pop(cnt2);
2333       movptr(str2, rsp);  // New substring address
2334     } // non constant
2335 
2336     bind(CHECK_STR);
2337     cmpl(cnt1, stride);
2338     jccb(Assembler::aboveEqual, BIG_STRINGS);
2339 
2340     // Check cross page boundary.
2341     movl(result, str1); // We need only low 32 bits
2342     andl(result, (os::vm_page_size()-1));
2343     cmpl(result, (os::vm_page_size()-16));
2344     jccb(Assembler::belowEqual, BIG_STRINGS);
2345 
2346     subptr(rsp, 16);
2347     int stk_offset = -(1<<scale1);
2348     if (int_cnt2 < 0) { // not constant
2349       push(cnt2);
2350       stk_offset += wordSize;
2351     }
2352     movl(cnt2, cnt1);
2353 
2354     bind(COPY_STR);
2355     if (ae == StrIntrinsicNode::LL) {
2356       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2357       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2358     } else {
2359       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2360       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2361     }
2362     decrement(cnt2);
2363     jccb(Assembler::notZero, COPY_STR);
2364 
2365     if (int_cnt2 < 0) { // not constant
2366       pop(cnt2);
2367     }
2368     movptr(str1, rsp);  // New string address
2369 
2370     bind(BIG_STRINGS);
2371     // Load substring.
2372     if (int_cnt2 < 0) { // -1
2373       if (ae == StrIntrinsicNode::UL) {
2374         pmovzxbw(vec, Address(str2, 0));
2375       } else {
2376         movdqu(vec, Address(str2, 0));
2377       }
2378       push(cnt2);       // substr count
2379       push(str2);       // substr addr
2380       push(str1);       // string addr
2381     } else {
2382       // Small (< 8 chars) constant substrings are loaded already.
2383       movl(cnt2, int_cnt2);
2384     }
2385     push(tmp);  // original SP
2386 
2387   } // Finished loading
2388 
2389   //========================================================
2390   // Start search
2391   //
2392 
2393   movptr(result, str1); // string addr
2394 
2395   if (int_cnt2  < 0) {  // Only for non constant substring
2396     jmpb(SCAN_TO_SUBSTR);
2397 
2398     // SP saved at sp+0
2399     // String saved at sp+1*wordSize
2400     // Substr saved at sp+2*wordSize
2401     // Substr count saved at sp+3*wordSize
2402 
2403     // Reload substr for rescan; this code
2404     // is executed only for large substrings (> 8 chars).
2405     bind(RELOAD_SUBSTR);
2406     movptr(str2, Address(rsp, 2*wordSize));
2407     movl(cnt2, Address(rsp, 3*wordSize));
2408     if (ae == StrIntrinsicNode::UL) {
2409       pmovzxbw(vec, Address(str2, 0));
2410     } else {
2411       movdqu(vec, Address(str2, 0));
2412     }
2413     // We came here after the beginning of the substring was
2414     // matched but the rest of it was not, so we need to search
2415     // again. Start from the next element after the previous match.
2416     subptr(str1, result); // Restore counter
2417     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2418       shrl(str1, 1);
2419     }
2420     addl(cnt1, str1);
2421     decrementl(cnt1);   // Shift to next element
2422     cmpl(cnt1, cnt2);
2423     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2424 
2425     addptr(result, (1<<scale1));
2426   } // non constant
2427 
2428   // Scan string for start of substr in 16-byte vectors
2429   bind(SCAN_TO_SUBSTR);
2430   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2431   pcmpestri(vec, Address(result, 0), mode);
2432   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2433   subl(cnt1, stride);
2434   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2435   cmpl(cnt1, cnt2);
2436   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2437   addptr(result, 16);
2438 
2439   bind(ADJUST_STR);
2440   cmpl(cnt1, stride); // Do not read beyond string
2441   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2442   // Back-up string to avoid reading beyond string.
2443   lea(result, Address(result, cnt1, scale1, -16));
2444   movl(cnt1, stride);
2445   jmpb(SCAN_TO_SUBSTR);
2446 
2447   // Found a potential substr
2448   bind(FOUND_CANDIDATE);
2449   // After pcmpestri tmp(rcx) contains matched element index
2450 
2451   // Make sure string is still long enough
2452   subl(cnt1, tmp);
2453   cmpl(cnt1, cnt2);
2454   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2455   // Fewer elements left than the substring.
2456 
2457   bind(RET_NOT_FOUND);
2458   movl(result, -1);
2459   jmp(CLEANUP);
2460 
2461   bind(FOUND_SUBSTR);
2462   // Compute start addr of substr
2463   lea(result, Address(result, tmp, scale1));
2464   if (int_cnt2 > 0) { // Constant substring
2465     // Repeat search for small substring (< 8 chars)
2466     // from new point without reloading substring.
2467     // Have to check that we don't read beyond string.
2468     cmpl(tmp, stride-int_cnt2);
2469     jccb(Assembler::greater, ADJUST_STR);
2470     // Fall through if matched whole substring.
2471   } else { // non constant
2472     assert(int_cnt2 == -1, "should be != 0");
2473 
2474     addl(tmp, cnt2);
2475     // Found result if we matched whole substring.
2476     cmpl(tmp, stride);
2477     jcc(Assembler::lessEqual, RET_FOUND);
2478 
2479     // Repeat search for small substring (<= 8 chars)
2480     // from new point 'str1' without reloading substring.
2481     cmpl(cnt2, stride);
2482     // Have to check that we don't read beyond string.
2483     jccb(Assembler::lessEqual, ADJUST_STR);
2484 
2485     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2486     // Compare the rest of substring (> 8 chars).
2487     movptr(str1, result);
2488 
2489     cmpl(tmp, cnt2);
2490     // First 8 chars are already matched.
2491     jccb(Assembler::equal, CHECK_NEXT);
2492 
2493     bind(SCAN_SUBSTR);
2494     pcmpestri(vec, Address(str1, 0), mode);
2495     // Need to reload string pointers if we did not match the whole vector
2496     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2497 
2498     bind(CHECK_NEXT);
2499     subl(cnt2, stride);
2500     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2501     addptr(str1, 16);
2502     if (ae == StrIntrinsicNode::UL) {
2503       addptr(str2, 8);
2504     } else {
2505       addptr(str2, 16);
2506     }
2507     subl(cnt1, stride);
2508     cmpl(cnt2, stride); // Do not read beyond substring
2509     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2510     // Back-up strings to avoid reading beyond substring.
2511 
2512     if (ae == StrIntrinsicNode::UL) {
2513       lea(str2, Address(str2, cnt2, scale2, -8));
2514       lea(str1, Address(str1, cnt2, scale1, -16));
2515     } else {
2516       lea(str2, Address(str2, cnt2, scale2, -16));
2517       lea(str1, Address(str1, cnt2, scale1, -16));
2518     }
2519     subl(cnt1, cnt2);
2520     movl(cnt2, stride);
2521     addl(cnt1, stride);
2522     bind(CONT_SCAN_SUBSTR);
2523     if (ae == StrIntrinsicNode::UL) {
2524       pmovzxbw(vec, Address(str2, 0));
2525     } else {
2526       movdqu(vec, Address(str2, 0));
2527     }
2528     jmp(SCAN_SUBSTR);
2529 
2530     bind(RET_FOUND_LONG);
2531     movptr(str1, Address(rsp, wordSize));
2532   } // non constant
2533 
2534   bind(RET_FOUND);
2535   // Compute substr offset
2536   subptr(result, str1);
2537   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2538     shrl(result, 1); // index
2539   }
2540   bind(CLEANUP);
2541   pop(rsp); // restore SP
2542 
2543 } // string_indexof
2544 
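     // Find the first occurrence of a UTF-16 char in a char array: broadcast the char,
     // scan 16 chars per iteration with AVX2 (32-byte loads), then 8 chars per iteration
     // with SSE, and handle the tail with a scalar loop. The char index is returned in
     // result, or -1 if the char is not found.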
2545 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2546                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2547   ShortBranchVerifier sbv(this);
2548   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2549 
2550   int stride = 8;
2551 
2552   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2553         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2554         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2555         FOUND_SEQ_CHAR, DONE_LABEL;
2556 
2557   movptr(result, str1);
2558   if (UseAVX >= 2) {
2559     cmpl(cnt1, stride);
2560     jcc(Assembler::less, SCAN_TO_CHAR);
2561     cmpl(cnt1, 2*stride);
2562     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2563     movdl(vec1, ch);
2564     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2565     vpxor(vec2, vec2);
2566     movl(tmp, cnt1);
2567     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2568     andl(cnt1,0x0000000F);  //tail count (in chars)
2569 
2570     bind(SCAN_TO_16_CHAR_LOOP);
2571     vmovdqu(vec3, Address(result, 0));
2572     vpcmpeqw(vec3, vec3, vec1, 1);
2573     vptest(vec2, vec3);
2574     jcc(Assembler::carryClear, FOUND_CHAR);
2575     addptr(result, 32);
2576     subl(tmp, 2*stride);
2577     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2578     jmp(SCAN_TO_8_CHAR);
2579     bind(SCAN_TO_8_CHAR_INIT);
2580     movdl(vec1, ch);
2581     pshuflw(vec1, vec1, 0x00);
2582     pshufd(vec1, vec1, 0);
2583     pxor(vec2, vec2);
2584   }
2585   bind(SCAN_TO_8_CHAR);
2586   cmpl(cnt1, stride);
2587   jcc(Assembler::less, SCAN_TO_CHAR);
2588   if (UseAVX < 2) {
2589     movdl(vec1, ch);
2590     pshuflw(vec1, vec1, 0x00);
2591     pshufd(vec1, vec1, 0);
2592     pxor(vec2, vec2);
2593   }
2594   movl(tmp, cnt1);
2595   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2596   andl(cnt1,0x00000007);  //tail count (in chars)
2597 
2598   bind(SCAN_TO_8_CHAR_LOOP);
2599   movdqu(vec3, Address(result, 0));
2600   pcmpeqw(vec3, vec1);
2601   ptest(vec2, vec3);
2602   jcc(Assembler::carryClear, FOUND_CHAR);
2603   addptr(result, 16);
2604   subl(tmp, stride);
2605   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2606   bind(SCAN_TO_CHAR);
2607   testl(cnt1, cnt1);
2608   jcc(Assembler::zero, RET_NOT_FOUND);
2609   bind(SCAN_TO_CHAR_LOOP);
2610   load_unsigned_short(tmp, Address(result, 0));
2611   cmpl(ch, tmp);
2612   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2613   addptr(result, 2);
2614   subl(cnt1, 1);
2615   jccb(Assembler::zero, RET_NOT_FOUND);
2616   jmp(SCAN_TO_CHAR_LOOP);
2617 
2618   bind(RET_NOT_FOUND);
2619   movl(result, -1);
2620   jmpb(DONE_LABEL);
2621 
2622   bind(FOUND_CHAR);
2623   if (UseAVX >= 2) {
2624     vpmovmskb(tmp, vec3);
2625   } else {
2626     pmovmskb(tmp, vec3);
2627   }
2628   bsfl(ch, tmp);
2629   addl(result, ch);
2630 
2631   bind(FOUND_SEQ_CHAR);
2632   subptr(result, str1);
2633   shrl(result, 1);
2634 
2635   bind(DONE_LABEL);
2636 } // string_indexof_char
2637 
2638 // helper function for string_compare
2639 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2640                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2641                                            Address::ScaleFactor scale2, Register index, int ae) {
2642   if (ae == StrIntrinsicNode::LL) {
2643     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2644     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2645   } else if (ae == StrIntrinsicNode::UU) {
2646     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2647     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2648   } else {
2649     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2650     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2651   }
2652 }
2653 
2654 // Compare strings, used for char[] and byte[].
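     // The generated code follows roughly this Java-style sketch (an
     // illustrative reference only, not library source; element widths depend
     // on 'ae', and the returned length difference is adjusted for byte vs.
     // char counts and negated for UL):
     //
     //   int compare(str1, cnt1, str2, cnt2) {
     //     int min = Math.min(cnt1, cnt2);
     //     for (int i = 0; i < min; i++) {
     //       int diff = str1[i] - str2[i];   // elements are zero-extended
     //       if (diff != 0) return diff;
     //     }
     //     return cnt1 - cnt2;
     //   }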
2655 void C2_MacroAssembler::string_compare(Register str1, Register str2,
2656                                        Register cnt1, Register cnt2, Register result,
2657                                        XMMRegister vec1, int ae) {
2658   ShortBranchVerifier sbv(this);
2659   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
2660   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
2661   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
2662   int stride2x2 = 0x40;
2663   Address::ScaleFactor scale = Address::no_scale;
2664   Address::ScaleFactor scale1 = Address::no_scale;
2665   Address::ScaleFactor scale2 = Address::no_scale;
2666 
2667   if (ae != StrIntrinsicNode::LL) {
2668     stride2x2 = 0x20;
2669   }
2670 
2671   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
2672     shrl(cnt2, 1);
2673   }
2674   // Compute the minimum of the string lengths, and save the
2675   // difference of the string lengths on the stack.
2676   // The minimum is selected below with a conditional move.
2677   movl(result, cnt1);
2678   subl(cnt1, cnt2);
2679   push(cnt1);
2680   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
2681 
2682   // Is the minimum length zero?
2683   testl(cnt2, cnt2);
2684   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2685   if (ae == StrIntrinsicNode::LL) {
2686     // Load first bytes
2687     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
2688     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
2689   } else if (ae == StrIntrinsicNode::UU) {
2690     // Load first characters
2691     load_unsigned_short(result, Address(str1, 0));
2692     load_unsigned_short(cnt1, Address(str2, 0));
2693   } else {
2694     load_unsigned_byte(result, Address(str1, 0));
2695     load_unsigned_short(cnt1, Address(str2, 0));
2696   }
2697   subl(result, cnt1);
2698   jcc(Assembler::notZero,  POP_LABEL);
2699 
2700   if (ae == StrIntrinsicNode::UU) {
2701     // Divide length by 2 to get number of chars
2702     shrl(cnt2, 1);
2703   }
2704   cmpl(cnt2, 1);
2705   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2706 
2707   // Check if the strings start at the same location and set up scale and stride
2708   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2709     cmpptr(str1, str2);
2710     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2711     if (ae == StrIntrinsicNode::LL) {
2712       scale = Address::times_1;
2713       stride = 16;
2714     } else {
2715       scale = Address::times_2;
2716       stride = 8;
2717     }
2718   } else {
2719     scale1 = Address::times_1;
2720     scale2 = Address::times_2;
2721     // scale not used
2722     stride = 8;
2723   }
2724 
2725   if (UseAVX >= 2 && UseSSE42Intrinsics) {
2726     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
2727     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
2728     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
2729     Label COMPARE_TAIL_LONG;
2730     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
2731 
2732     int pcmpmask = 0x19;
2733     if (ae == StrIntrinsicNode::LL) {
2734       pcmpmask &= ~0x01;
2735     }
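         // pcmpmask 0x19 selects equal-each aggregation with negated result on
         // unsigned words; clearing bit 0 (0x18) selects unsigned bytes for the
         // LL case (see the pcmpestri notes in the SSE4.2 path below).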
2736 
2737     // Set up to compare 16-char (32-byte) vectors,
2738     // starting from the first character again because it has an aligned address.
2739     if (ae == StrIntrinsicNode::LL) {
2740       stride2 = 32;
2741     } else {
2742       stride2 = 16;
2743     }
2744     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2745       adr_stride = stride << scale;
2746     } else {
2747       adr_stride1 = 8;  //stride << scale1;
2748       adr_stride2 = 16; //stride << scale2;
2749     }
2750 
2751     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2752     // rax and rdx are used by pcmpestri as element counters
2753     movl(result, cnt2);
2754     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
2755     jcc(Assembler::zero, COMPARE_TAIL_LONG);
2756 
2757     // Fast path: compare the first two 8-char vectors.
2758     bind(COMPARE_16_CHARS);
2759     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2760       movdqu(vec1, Address(str1, 0));
2761     } else {
2762       pmovzxbw(vec1, Address(str1, 0));
2763     }
2764     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2765     jccb(Assembler::below, COMPARE_INDEX_CHAR);
2766 
2767     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2768       movdqu(vec1, Address(str1, adr_stride));
2769       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
2770     } else {
2771       pmovzxbw(vec1, Address(str1, adr_stride1));
2772       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
2773     }
2774     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
2775     addl(cnt1, stride);
2776 
2777     // Compare the characters at index in cnt1
2778     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
2779     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2780     subl(result, cnt2);
2781     jmp(POP_LABEL);
2782 
2783     // Setup the registers to start vector comparison loop
2784     bind(COMPARE_WIDE_VECTORS);
2785     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2786       lea(str1, Address(str1, result, scale));
2787       lea(str2, Address(str2, result, scale));
2788     } else {
2789       lea(str1, Address(str1, result, scale1));
2790       lea(str2, Address(str2, result, scale2));
2791     }
2792     subl(result, stride2);
2793     subl(cnt2, stride2);
2794     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
2795     negptr(result);
2796 
2797     // In a loop, compare 16 chars (32 bytes) at a time using vpxor+vptest.
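         // vpxor leaves vec1 all zero only when the two 32-byte chunks are
         // identical; vptest(vec1, vec1) then sets ZF, so a notZero branch
         // means a mismatch somewhere in the chunk.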
2798     bind(COMPARE_WIDE_VECTORS_LOOP);
2799 
2800 #ifdef _LP64
2801     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
2802       cmpl(cnt2, stride2x2);
2803       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2804       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
2805       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // cnt2 is not a multiple of stride2x2, so the 64-byte loop cannot be used
2806 
2807       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2808       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2809         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
2810         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
2811       } else {
2812         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
2813         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal, otherwise k7 has some 0 bits
2814       }
2815       kortestql(k7, k7);
2816       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
2817       addptr(result, stride2x2);  // update since we already compared at this addr
2818       subl(cnt2, stride2x2);      // and sub the size too
2819       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2820 
2821       vpxor(vec1, vec1);
2822       jmpb(COMPARE_WIDE_TAIL);
2823     }//if (VM_Version::supports_avx512vlbw())
2824 #endif // _LP64
2825 
2826 
2827     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2828     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2829       vmovdqu(vec1, Address(str1, result, scale));
2830       vpxor(vec1, Address(str2, result, scale));
2831     } else {
2832       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
2833       vpxor(vec1, Address(str2, result, scale2));
2834     }
2835     vptest(vec1, vec1);
2836     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
2837     addptr(result, stride2);
2838     subl(cnt2, stride2);
2839     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
2840     // clean upper bits of YMM registers
2841     vpxor(vec1, vec1);
2842 
2843     // compare wide vectors tail
2844     bind(COMPARE_WIDE_TAIL);
2845     testptr(result, result);
2846     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2847 
2848     movl(result, stride2);
2849     movl(cnt2, result);
2850     negptr(result);
2851     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2852 
2853     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
2854     bind(VECTOR_NOT_EQUAL);
2855     // clean upper bits of YMM registers
2856     vpxor(vec1, vec1);
2857     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2858       lea(str1, Address(str1, result, scale));
2859       lea(str2, Address(str2, result, scale));
2860     } else {
2861       lea(str1, Address(str1, result, scale1));
2862       lea(str2, Address(str2, result, scale2));
2863     }
2864     jmp(COMPARE_16_CHARS);
2865 
2866     // Compare tail chars, length between 1 and 15 chars
2867     bind(COMPARE_TAIL_LONG);
2868     movl(cnt2, result);
2869     cmpl(cnt2, stride);
2870     jcc(Assembler::less, COMPARE_SMALL_STR);
2871 
2872     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2873       movdqu(vec1, Address(str1, 0));
2874     } else {
2875       pmovzxbw(vec1, Address(str1, 0));
2876     }
2877     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2878     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2879     subptr(cnt2, stride);
2880     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2881     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2882       lea(str1, Address(str1, result, scale));
2883       lea(str2, Address(str2, result, scale));
2884     } else {
2885       lea(str1, Address(str1, result, scale1));
2886       lea(str2, Address(str2, result, scale2));
2887     }
2888     negptr(cnt2);
2889     jmpb(WHILE_HEAD_LABEL);
2890 
2891     bind(COMPARE_SMALL_STR);
2892   } else if (UseSSE42Intrinsics) {
2893     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2894     int pcmpmask = 0x19;
2895     // Set up to compare 8-char (16-byte) vectors,
2896     // starting from the first character again because it has an aligned address.
2897     movl(result, cnt2);
2898     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2899     if (ae == StrIntrinsicNode::LL) {
2900       pcmpmask &= ~0x01;
2901     }
2902     jcc(Assembler::zero, COMPARE_TAIL);
2903     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2904       lea(str1, Address(str1, result, scale));
2905       lea(str2, Address(str2, result, scale));
2906     } else {
2907       lea(str1, Address(str1, result, scale1));
2908       lea(str2, Address(str2, result, scale2));
2909     }
2910     negptr(result);
2911 
2912     // pcmpestri
2913     //   inputs:
2914     //     vec1- substring
2915     //     rax - negative string length (elements count)
2916     //     mem - scanned string
2917     //     rdx - string length (elements count)
2918     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2919     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2920     //   outputs:
2921     //     rcx - first mismatched element index
2922     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2923 
2924     bind(COMPARE_WIDE_VECTORS);
2925     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2926       movdqu(vec1, Address(str1, result, scale));
2927       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2928     } else {
2929       pmovzxbw(vec1, Address(str1, result, scale1));
2930       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2931     }
2932     // After pcmpestri, cnt1 (rcx) contains the mismatched element index
2933 
2934     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2935     addptr(result, stride);
2936     subptr(cnt2, stride);
2937     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2938 
2939     // compare wide vectors tail
2940     testptr(result, result);
2941     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2942 
2943     movl(cnt2, stride);
2944     movl(result, stride);
2945     negptr(result);
2946     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2947       movdqu(vec1, Address(str1, result, scale));
2948       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2949     } else {
2950       pmovzxbw(vec1, Address(str1, result, scale1));
2951       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2952     }
2953     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2954 
2955     // Mismatched characters in the vectors
2956     bind(VECTOR_NOT_EQUAL);
2957     addptr(cnt1, result);
2958     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2959     subl(result, cnt2);
2960     jmpb(POP_LABEL);
2961 
2962     bind(COMPARE_TAIL); // the vector count is zero
2963     movl(cnt2, result);
2964     // Fallthru to tail compare
2965   }
2966   // Shift str2 and str1 to the end of the arrays, negate min
2967   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2968     lea(str1, Address(str1, cnt2, scale));
2969     lea(str2, Address(str2, cnt2, scale));
2970   } else {
2971     lea(str1, Address(str1, cnt2, scale1));
2972     lea(str2, Address(str2, cnt2, scale2));
2973   }
2974   decrementl(cnt2);  // first character was compared already
2975   negptr(cnt2);
2976 
2977   // Compare the rest of the elements
2978   bind(WHILE_HEAD_LABEL);
2979   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2980   subl(result, cnt1);
2981   jccb(Assembler::notZero, POP_LABEL);
2982   increment(cnt2);
2983   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2984 
2985   // Strings are equal up to min length.  Return the length difference.
2986   bind(LENGTH_DIFF_LABEL);
2987   pop(result);
2988   if (ae == StrIntrinsicNode::UU) {
2989     // Divide diff by 2 to get number of chars
2990     sarl(result, 1);
2991   }
2992   jmpb(DONE_LABEL);
2993 
2994 #ifdef _LP64
2995   if (VM_Version::supports_avx512vlbw()) {
2996 
2997     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2998 
2999     kmovql(cnt1, k7);
3000     notq(cnt1);
3001     bsfq(cnt2, cnt1);
3002     if (ae != StrIntrinsicNode::LL) {
3003       // Divide diff by 2 to get number of chars
3004       sarl(cnt2, 1);
3005     }
3006     addq(result, cnt2);
3007     if (ae == StrIntrinsicNode::LL) {
3008       load_unsigned_byte(cnt1, Address(str2, result));
3009       load_unsigned_byte(result, Address(str1, result));
3010     } else if (ae == StrIntrinsicNode::UU) {
3011       load_unsigned_short(cnt1, Address(str2, result, scale));
3012       load_unsigned_short(result, Address(str1, result, scale));
3013     } else {
3014       load_unsigned_short(cnt1, Address(str2, result, scale2));
3015       load_unsigned_byte(result, Address(str1, result, scale1));
3016     }
3017     subl(result, cnt1);
3018     jmpb(POP_LABEL);
3019   }//if (VM_Version::supports_avx512vlbw())
3020 #endif // _LP64
3021 
3022   // Discard the stored length difference
3023   bind(POP_LABEL);
3024   pop(cnt1);
3025 
3026   // That's it
3027   bind(DONE_LABEL);
3028   if (ae == StrIntrinsicNode::UL) {
3029     negl(result);
3030   }
3031 
3032 }
3033 
3034 // Search for a non-ASCII character (negative byte value) in a byte array and
3035 // return true if one is found, false otherwise.
3036 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3037 //   @HotSpotIntrinsicCandidate
3038 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3039 //     for (int i = off; i < off + len; i++) {
3040 //       if (ba[i] < 0) {
3041 //         return true;
3042 //       }
3043 //     }
3044 //     return false;
3045 //   }
3046 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3047   Register result, Register tmp1,
3048   XMMRegister vec1, XMMRegister vec2) {
3049   // rsi: byte array
3050   // rcx: len
3051   // rax: result
3052   ShortBranchVerifier sbv(this);
3053   assert_different_registers(ary1, len, result, tmp1);
3054   assert_different_registers(vec1, vec2);
3055   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3056 
3057   // len == 0
3058   testl(len, len);
3059   jcc(Assembler::zero, FALSE_LABEL);
3060 
3061   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3062     VM_Version::supports_avx512vlbw() &&
3063     VM_Version::supports_bmi2()) {
3064 
3065     Label test_64_loop, test_tail;
3066     Register tmp3_aliased = len;
3067 
3068     movl(tmp1, len);
3069     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3070 
3071     andl(tmp1, 64 - 1);   // tail count (in bytes) 0x3F
3072     andl(len, ~(64 - 1)); // vector count (in bytes)
3073     jccb(Assembler::zero, test_tail);
3074 
3075     lea(ary1, Address(ary1, len, Address::times_1));
3076     negptr(len);
3077 
3078     bind(test_64_loop);
3079     // Check whether these 64 bytes contain any negative values
3080     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3081     kortestql(k2, k2);
3082     jcc(Assembler::notZero, TRUE_LABEL);
3083 
3084     addptr(len, 64);
3085     jccb(Assembler::notZero, test_64_loop);
3086 
3087 
3088     bind(test_tail);
3089     // bail out when there is nothing to be done
3090     testl(tmp1, -1);
3091     jcc(Assembler::zero, FALSE_LABEL);
3092 
3093     // Build a mask with tmp1 low-order 1 bits: ~(~0 << tmp1); e.g. tmp1 == 5
         // gives 0x1F. The 32-bit path composes the same mask via the byte table below.
3094 #ifdef _LP64
3095     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3096     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3097     notq(tmp3_aliased);
3098     kmovql(k3, tmp3_aliased);
3099 #else
3100     Label k_init;
3101     jmp(k_init);
3102 
3103     // We cannot read 64 bits from a general purpose register, so the data
3104     // required to compose 64 1's is placed in the instruction stream instead.
3105     // We emit a 64-byte-wide series of elements from 0..63 which is later
3106     // used as a compare target against the tail count held in tmp1.
3107     // The result is a k register with tmp1 consecutive 1 bits, counting
3108     // from the least significant bit.
3109     address tmp = pc();
3110     emit_int64(0x0706050403020100);
3111     emit_int64(0x0F0E0D0C0B0A0908);
3112     emit_int64(0x1716151413121110);
3113     emit_int64(0x1F1E1D1C1B1A1918);
3114     emit_int64(0x2726252423222120);
3115     emit_int64(0x2F2E2D2C2B2A2928);
3116     emit_int64(0x3736353433323130);
3117     emit_int64(0x3F3E3D3C3B3A3938);
3118 
3119     bind(k_init);
3120     lea(len, InternalAddress(tmp));
3121     // create mask to test for negative byte inside a vector
3122     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3123     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
3124 
3125 #endif
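         // Compare only the tail bytes: the compare below is masked by k3
         // (one bit per valid tail byte), and ktestq sets ZF when
         // (k2 & k3) == 0, i.e. none of the masked bytes is negative.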
3126     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3127     ktestq(k2, k3);
3128     jcc(Assembler::notZero, TRUE_LABEL);
3129 
3130     jmp(FALSE_LABEL);
3131   } else {
3132     movl(result, len); // copy
3133 
3134     if (UseAVX >= 2 && UseSSE >= 2) {
3135       // With AVX2, use 32-byte vector compare
3136       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3137 
3138       // Compare 32-byte vectors
3139       andl(result, 0x0000001f);  //   tail count (in bytes)
3140       andl(len, 0xffffffe0);   // vector count (in bytes)
3141       jccb(Assembler::zero, COMPARE_TAIL);
3142 
3143       lea(ary1, Address(ary1, len, Address::times_1));
3144       negptr(len);
3145 
3146       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
3147       movdl(vec2, tmp1);
3148       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3149 
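           // vptest computes (vec1 & vec2) and sets ZF when that is all zero;
           // with vec2 == 0x80..80, a notZero branch means some byte in the
           // 32-byte chunk has its sign bit set, i.e. is negative.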
3150       bind(COMPARE_WIDE_VECTORS);
3151       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3152       vptest(vec1, vec2);
3153       jccb(Assembler::notZero, TRUE_LABEL);
3154       addptr(len, 32);
3155       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3156 
3157       testl(result, result);
3158       jccb(Assembler::zero, FALSE_LABEL);
3159 
3160       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3161       vptest(vec1, vec2);
3162       jccb(Assembler::notZero, TRUE_LABEL);
3163       jmpb(FALSE_LABEL);
3164 
3165       bind(COMPARE_TAIL); // len is zero
3166       movl(len, result);
3167       // Fallthru to tail compare
3168     } else if (UseSSE42Intrinsics) {
3169       // With SSE4.2, use double quad vector compare
3170       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3171 
3172       // Compare 16-byte vectors
3173       andl(result, 0x0000000f);  //   tail count (in bytes)
3174       andl(len, 0xfffffff0);   // vector count (in bytes)
3175       jcc(Assembler::zero, COMPARE_TAIL);
3176 
3177       lea(ary1, Address(ary1, len, Address::times_1));
3178       negptr(len);
3179 
3180       movl(tmp1, 0x80808080);
3181       movdl(vec2, tmp1);
3182       pshufd(vec2, vec2, 0);
3183 
3184       bind(COMPARE_WIDE_VECTORS);
3185       movdqu(vec1, Address(ary1, len, Address::times_1));
3186       ptest(vec1, vec2);
3187       jcc(Assembler::notZero, TRUE_LABEL);
3188       addptr(len, 16);
3189       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3190 
3191       testl(result, result);
3192       jcc(Assembler::zero, FALSE_LABEL);
3193 
3194       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3195       ptest(vec1, vec2);
3196       jccb(Assembler::notZero, TRUE_LABEL);
3197       jmpb(FALSE_LABEL);
3198 
3199       bind(COMPARE_TAIL); // len is zero
3200       movl(len, result);
3201       // Fallthru to tail compare
3202     }
3203   }
3204   // Compare 4-byte vectors
3205   andl(len, 0xfffffffc); // vector count (in bytes)
3206   jccb(Assembler::zero, COMPARE_CHAR);
3207 
3208   lea(ary1, Address(ary1, len, Address::times_1));
3209   negptr(len);
3210 
3211   bind(COMPARE_VECTORS);
3212   movl(tmp1, Address(ary1, len, Address::times_1));
3213   andl(tmp1, 0x80808080);
3214   jccb(Assembler::notZero, TRUE_LABEL);
3215   addptr(len, 4);
3216   jcc(Assembler::notZero, COMPARE_VECTORS);
3217 
3218   // Compare trailing char (final 2 bytes), if any
3219   bind(COMPARE_CHAR);
3220   testl(result, 0x2);   // tail  char
3221   jccb(Assembler::zero, COMPARE_BYTE);
3222   load_unsigned_short(tmp1, Address(ary1, 0));
3223   andl(tmp1, 0x00008080);
3224   jccb(Assembler::notZero, TRUE_LABEL);
3225   subptr(result, 2);
3226   lea(ary1, Address(ary1, 2));
3227 
3228   bind(COMPARE_BYTE);
3229   testl(result, 0x1);   // tail  byte
3230   jccb(Assembler::zero, FALSE_LABEL);
3231   load_unsigned_byte(tmp1, Address(ary1, 0));
3232   andl(tmp1, 0x00000080);
3233   jccb(Assembler::notEqual, TRUE_LABEL);
3234   jmpb(FALSE_LABEL);
3235 
3236   bind(TRUE_LABEL);
3237   movl(result, 1);   // return true
3238   jmpb(DONE);
3239 
3240   bind(FALSE_LABEL);
3241   xorl(result, result); // return false
3242 
3243   // That's it
3244   bind(DONE);
3245   if (UseAVX >= 2 && UseSSE >= 2) {
3246     // clean upper bits of YMM registers
3247     vpxor(vec1, vec1);
3248     vpxor(vec2, vec2);
3249   }
3250 }
3251 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
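     // The generated code follows roughly this Java-style sketch (an
     // illustrative reference only; when is_array_equ is false the null and
     // length checks are skipped and 'limit' is used directly as the length
     // to compare):
     //
     //   boolean equals(ary1, ary2) {
     //     if (ary1 == ary2) return true;
     //     if (ary1 == null || ary2 == null) return false;
     //     if (ary1.length != ary2.length) return false;
     //     for (int i = 0; i < ary1.length; i++) {
     //       if (ary1[i] != ary2[i]) return false;
     //     }
     //     return true;
     //   }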
3252 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3253                                       Register limit, Register result, Register chr,
3254                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
3255   ShortBranchVerifier sbv(this);
3256   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3257 
3258   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3259   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3260 
3261   if (is_array_equ) {
3262     // Check the input args
3263     cmpoop(ary1, ary2);
3264     jcc(Assembler::equal, TRUE_LABEL);
3265 
3266     // Need additional checks for arrays_equals.
3267     testptr(ary1, ary1);
3268     jcc(Assembler::zero, FALSE_LABEL);
3269     testptr(ary2, ary2);
3270     jcc(Assembler::zero, FALSE_LABEL);
3271 
3272     // Check the lengths
3273     movl(limit, Address(ary1, length_offset));
3274     cmpl(limit, Address(ary2, length_offset));
3275     jcc(Assembler::notEqual, FALSE_LABEL);
3276   }
3277 
3278   // count == 0
3279   testl(limit, limit);
3280   jcc(Assembler::zero, TRUE_LABEL);
3281 
3282   if (is_array_equ) {
3283     // Load array address
3284     lea(ary1, Address(ary1, base_offset));
3285     lea(ary2, Address(ary2, base_offset));
3286   }
3287 
3288   if (is_array_equ && is_char) {
3289     // arrays_equals when used for char[].
3290     shll(limit, 1);      // convert char count to byte count (still != 0)
3291   }
3292   movl(result, limit); // copy
3293 
3294   if (UseAVX >= 2) {
3295     // With AVX2, use 32-byte vector compare
3296     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3297 
3298     // Compare 32-byte vectors
3299     andl(result, 0x0000001f);  //   tail count (in bytes)
3300     andl(limit, 0xffffffe0);   // vector count (in bytes)
3301     jcc(Assembler::zero, COMPARE_TAIL);
3302 
3303     lea(ary1, Address(ary1, limit, Address::times_1));
3304     lea(ary2, Address(ary2, limit, Address::times_1));
3305     negptr(limit);
3306 
3307 #ifdef _LP64
3308     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3309       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3310 
3311       cmpl(limit, -64);
3312       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3313 
3314       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3315 
3316       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3317       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3318       kortestql(k7, k7);
3319       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3320       addptr(limit, 64);  // update since we already compared at this addr
3321       cmpl(limit, -64);
3322       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3323 
3324       // At this point we may still need to compare -limit+result bytes.
3325       // We could execute the next two instructions and just continue via the non-wide path:
3326       //  cmpl(limit, 0);
3327       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3328       // But since we stopped at the points ary{1,2}+limit which are
3329       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3330       // (|limit| <= 32 and result < 32),
3331       // we may just compare the last 64 bytes.
3332       //
3333       addptr(result, -64);   // it is safe, because we just came from this area
3334       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3335       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3336       kortestql(k7, k7);
3337       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3338 
3339       jmp(TRUE_LABEL);
3340 
3341       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3342 
3343     }//if (VM_Version::supports_avx512vlbw())
3344 #endif //_LP64
3345     bind(COMPARE_WIDE_VECTORS);
3346     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3347     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3348     vpxor(vec1, vec2);
3349 
3350     vptest(vec1, vec1);
3351     jcc(Assembler::notZero, FALSE_LABEL);
3352     addptr(limit, 32);
3353     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3354 
3355     testl(result, result);
3356     jcc(Assembler::zero, TRUE_LABEL);
3357 
3358     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3359     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3360     vpxor(vec1, vec2);
3361 
3362     vptest(vec1, vec1);
3363     jccb(Assembler::notZero, FALSE_LABEL);
3364     jmpb(TRUE_LABEL);
3365 
3366     bind(COMPARE_TAIL); // limit is zero
3367     movl(limit, result);
3368     // Fallthru to tail compare
3369   } else if (UseSSE42Intrinsics) {
3370     // With SSE4.2, use double quad vector compare
3371     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3372 
3373     // Compare 16-byte vectors
3374     andl(result, 0x0000000f);  //   tail count (in bytes)
3375     andl(limit, 0xfffffff0);   // vector count (in bytes)
3376     jcc(Assembler::zero, COMPARE_TAIL);
3377 
3378     lea(ary1, Address(ary1, limit, Address::times_1));
3379     lea(ary2, Address(ary2, limit, Address::times_1));
3380     negptr(limit);
3381 
3382     bind(COMPARE_WIDE_VECTORS);
3383     movdqu(vec1, Address(ary1, limit, Address::times_1));
3384     movdqu(vec2, Address(ary2, limit, Address::times_1));
3385     pxor(vec1, vec2);
3386 
3387     ptest(vec1, vec1);
3388     jcc(Assembler::notZero, FALSE_LABEL);
3389     addptr(limit, 16);
3390     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3391 
3392     testl(result, result);
3393     jcc(Assembler::zero, TRUE_LABEL);
3394 
3395     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3396     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3397     pxor(vec1, vec2);
3398 
3399     ptest(vec1, vec1);
3400     jccb(Assembler::notZero, FALSE_LABEL);
3401     jmpb(TRUE_LABEL);
3402 
3403     bind(COMPARE_TAIL); // limit is zero
3404     movl(limit, result);
3405     // Fallthru to tail compare
3406   }
3407 
3408   // Compare 4-byte vectors
3409   andl(limit, 0xfffffffc); // vector count (in bytes)
3410   jccb(Assembler::zero, COMPARE_CHAR);
3411 
3412   lea(ary1, Address(ary1, limit, Address::times_1));
3413   lea(ary2, Address(ary2, limit, Address::times_1));
3414   negptr(limit);
3415 
3416   bind(COMPARE_VECTORS);
3417   movl(chr, Address(ary1, limit, Address::times_1));
3418   cmpl(chr, Address(ary2, limit, Address::times_1));
3419   jccb(Assembler::notEqual, FALSE_LABEL);
3420   addptr(limit, 4);
3421   jcc(Assembler::notZero, COMPARE_VECTORS);
3422 
3423   // Compare trailing char (final 2 bytes), if any
3424   bind(COMPARE_CHAR);
3425   testl(result, 0x2);   // tail  char
3426   jccb(Assembler::zero, COMPARE_BYTE);
3427   load_unsigned_short(chr, Address(ary1, 0));
3428   load_unsigned_short(limit, Address(ary2, 0));
3429   cmpl(chr, limit);
3430   jccb(Assembler::notEqual, FALSE_LABEL);
3431 
3432   if (is_array_equ && is_char) {
3433     bind(COMPARE_BYTE);
3434   } else {
3435     lea(ary1, Address(ary1, 2));
3436     lea(ary2, Address(ary2, 2));
3437 
3438     bind(COMPARE_BYTE);
3439     testl(result, 0x1);   // tail  byte
3440     jccb(Assembler::zero, TRUE_LABEL);
3441     load_unsigned_byte(chr, Address(ary1, 0));
3442     load_unsigned_byte(limit, Address(ary2, 0));
3443     cmpl(chr, limit);
3444     jccb(Assembler::notEqual, FALSE_LABEL);
3445   }
3446   bind(TRUE_LABEL);
3447   movl(result, 1);   // return true
3448   jmpb(DONE);
3449 
3450   bind(FALSE_LABEL);
3451   xorl(result, result); // return false
3452 
3453   // That's it
3454   bind(DONE);
3455   if (UseAVX >= 2) {
3456     // clean upper bits of YMM registers
3457     vpxor(vec1, vec1);
3458     vpxor(vec2, vec2);
3459   }
3460 }