1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
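     // setvectmask: load k1 with a mask whose low 'src' bits are set
     // (dst = (1 << src) - 1, then kmovdl(k1, dst)); dst is left holding src.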
  36 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  37   guarantee(PostLoopMultiversioning, "must be");
  38   Assembler::movl(dst, 1);
  39   Assembler::shlxl(dst, dst, src);
  40   Assembler::decl(dst);
  41   Assembler::kmovdl(k1, dst);
  42   Assembler::movl(dst, src);
  43 }
  44 
  45 void C2_MacroAssembler::restorevectmask() {
  46   guarantee(PostLoopMultiversioning, "must be");
  47   Assembler::knotwl(k1, k0);
  48 }
  49 
  50 #if INCLUDE_RTM_OPT
  51 
  52 // Update rtm_counters based on abort status
  53 // input: abort_status
  54 //        rtm_counters (RTMLockingCounters*)
  55 // flags are killed
  56 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  57 
  58   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  59   if (PrintPreciseRTMLockingStatistics) {
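         // For each abort-reason bit set in abort_status, bump the matching
         // per-reason counter (the abortX counters are laid out as an array indexed by bit).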
  60     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  61       Label check_abort;
  62       testl(abort_status, (1<<i));
  63       jccb(Assembler::equal, check_abort);
  64       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  65       bind(check_abort);
  66     }
  67   }
  68 }
  69 
  70 // Branch if (random & (count-1) != 0), count is 2^n
  71 // tmp, scr and flags are killed
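     // The low-order bits of the time-stamp counter act as a cheap pseudo-random value;
     // with count = 2^n the branch is taken with probability roughly (2^n - 1) / 2^n.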
  72 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  73   assert(tmp == rax, "");
  74   assert(scr == rdx, "");
  75   rdtsc(); // modifies EDX:EAX
  76   andptr(tmp, count-1);
  77   jccb(Assembler::notZero, brLabel);
  78 }
  79 
  80 // Perform abort ratio calculation, set no_rtm bit if high ratio
  81 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  82 // tmpReg, rtm_counters_Reg and flags are killed
  83 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  84                                                     Register rtm_counters_Reg,
  85                                                     RTMLockingCounters* rtm_counters,
  86                                                     Metadata* method_data) {
  87   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
  88 
  89   if (RTMLockingCalculationDelay > 0) {
  90     // Delay calculation
  91     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
  92     testptr(tmpReg, tmpReg);
  93     jccb(Assembler::equal, L_done);
  94   }
  95   // Abort ratio calculation only if abort_count > RTMAbortThreshold
  96   //   Aborted transactions = abort_count * 100
  97   //   All transactions = total_count *  RTMTotalCountIncrRate
  98   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
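       //   For example, with illustrative values RTMAbortRatio = 50 and
       //   RTMTotalCountIncrRate = 64, no_rtm is set once
       //   abort_count * 100 >= total_count * 64 * 50, i.e. once aborts account for
       //   at least half of the estimated total number of transactions.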
  99 
 100   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 101   cmpptr(tmpReg, RTMAbortThreshold);
 102   jccb(Assembler::below, L_check_always_rtm2);
 103   imulptr(tmpReg, tmpReg, 100);
 104 
 105   Register scrReg = rtm_counters_Reg;
 106   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 107   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 108   imulptr(scrReg, scrReg, RTMAbortRatio);
 109   cmpptr(tmpReg, scrReg);
 110   jccb(Assembler::below, L_check_always_rtm1);
 111   if (method_data != NULL) {
 112     // set rtm_state to "no rtm" in MDO
 113     mov_metadata(tmpReg, method_data);
 114     lock();
 115     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 116   }
 117   jmpb(L_done);
 118   bind(L_check_always_rtm1);
 119   // Reload RTMLockingCounters* address
 120   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 121   bind(L_check_always_rtm2);
 122   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 123   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 124   jccb(Assembler::below, L_done);
 125   if (method_data != NULL) {
 126     // set rtm_state to "always rtm" in MDO
 127     mov_metadata(tmpReg, method_data);
 128     lock();
 129     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 130   }
 131   bind(L_done);
 132 }
 133 
 134 // Update counters and perform abort ratio calculation
 135 // input:  abort_status_Reg
 136 // rtm_counters_Reg, flags are killed
 137 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 138                                       Register rtm_counters_Reg,
 139                                       RTMLockingCounters* rtm_counters,
 140                                       Metadata* method_data,
 141                                       bool profile_rtm) {
 142 
 143   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 144   // update rtm counters based on rax value at abort
 145   // reads abort_status_Reg, updates flags
 146   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 147   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 148   if (profile_rtm) {
 149     // Save abort status because abort_status_Reg is used by following code.
 150     if (RTMRetryCount > 0) {
 151       push(abort_status_Reg);
 152     }
 153     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 154     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 155     // restore abort status
 156     if (RTMRetryCount > 0) {
 157       pop(abort_status_Reg);
 158     }
 159   }
 160 }
 161 
 162 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 163 // inputs: retry_count_Reg
 164 //       : abort_status_Reg
 165 // output: retry_count_Reg decremented by 1
 166 // flags are killed
 167 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 168   Label doneRetry;
 169   assert(abort_status_Reg == rax, "");
 170   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 171   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 172   // if reason is in 0x6 and retry count != 0 then retry
 173   andptr(abort_status_Reg, 0x6);
 174   jccb(Assembler::zero, doneRetry);
 175   testl(retry_count_Reg, retry_count_Reg);
 176   jccb(Assembler::zero, doneRetry);
 177   pause();
 178   decrementl(retry_count_Reg);
 179   jmp(retryLabel);
 180   bind(doneRetry);
 181 }
 182 
 183 // Spin and retry if lock is busy,
 184 // inputs: box_Reg (monitor address)
 185 //       : retry_count_Reg
 186 // output: retry_count_Reg decremented by 1
 187 //       : clear z flag if retry count exceeded
 188 // tmp_Reg, scr_Reg, flags are killed
 189 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 190                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 191   Label SpinLoop, SpinExit, doneRetry;
 192   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 193 
 194   testl(retry_count_Reg, retry_count_Reg);
 195   jccb(Assembler::zero, doneRetry);
 196   decrementl(retry_count_Reg);
 197   movptr(scr_Reg, RTMSpinLoopCount);
 198 
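       // Spin (with pause) for at most RTMSpinLoopCount iterations or until the owner
       // field is observed to be NULL, then jump back to retry the lock.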
 199   bind(SpinLoop);
 200   pause();
 201   decrementl(scr_Reg);
 202   jccb(Assembler::lessEqual, SpinExit);
 203   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 204   testptr(tmp_Reg, tmp_Reg);
 205   jccb(Assembler::notZero, SpinLoop);
 206 
 207   bind(SpinExit);
 208   jmp(retryLabel);
 209   bind(doneRetry);
 210   incrementl(retry_count_Reg); // clear z flag
 211 }
 212 
 213 // Use RTM for normal stack locks
 214 // Input: objReg (object to lock)
 215 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 216                                          Register retry_on_abort_count_Reg,
 217                                          RTMLockingCounters* stack_rtm_counters,
 218                                          Metadata* method_data, bool profile_rtm,
 219                                          Label& DONE_LABEL, Label& IsInflated) {
 220   assert(UseRTMForStackLocks, "why call this otherwise?");
 221   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 222   assert(tmpReg == rax, "");
 223   assert(scrReg == rdx, "");
 224   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 225 
 226   if (RTMRetryCount > 0) {
 227     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 228     bind(L_rtm_retry);
 229   }
 230   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 231   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 232   jcc(Assembler::notZero, IsInflated);
 233 
 234   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 235     Label L_noincrement;
 236     if (RTMTotalCountIncrRate > 1) {
 237       // tmpReg, scrReg and flags are killed
 238       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 239     }
 240     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 241     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 242     bind(L_noincrement);
 243   }
 244   xbegin(L_on_abort);
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 246   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 247   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 248   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 249 
 250   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 251   if (UseRTMXendForLockBusy) {
 252     xend();
 253     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 254     jmp(L_decrement_retry);
 255   }
 256   else {
 257     xabort(0);
 258   }
 259   bind(L_on_abort);
 260   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 261     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 262   }
 263   bind(L_decrement_retry);
 264   if (RTMRetryCount > 0) {
 265     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 266     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 267   }
 268 }
 269 
 270 // Use RTM for inflated locks
 271 // inputs: objReg (object to lock)
 272 //         boxReg (on-stack box address (displaced header location) - KILLED)
 273 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 274 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 275                                             Register scrReg, Register retry_on_busy_count_Reg,
 276                                             Register retry_on_abort_count_Reg,
 277                                             RTMLockingCounters* rtm_counters,
 278                                             Metadata* method_data, bool profile_rtm,
 279                                             Label& DONE_LABEL) {
 280   assert(UseRTMLocking, "why call this otherwise?");
 281   assert(tmpReg == rax, "");
 282   assert(scrReg == rdx, "");
 283   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 284   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 285 
 286   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 287   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 288   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 289 
 290   if (RTMRetryCount > 0) {
 291     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 292     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 293     bind(L_rtm_retry);
 294   }
 295   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 296     Label L_noincrement;
 297     if (RTMTotalCountIncrRate > 1) {
 298       // tmpReg, scrReg and flags are killed
 299       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 300     }
 301     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 302     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 303     bind(L_noincrement);
 304   }
 305   xbegin(L_on_abort);
 306   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 307   movptr(tmpReg, Address(tmpReg, owner_offset));
 308   testptr(tmpReg, tmpReg);
 309   jcc(Assembler::zero, DONE_LABEL);
 310   if (UseRTMXendForLockBusy) {
 311     xend();
 312     jmp(L_decrement_retry);
 313   }
 314   else {
 315     xabort(0);
 316   }
 317   bind(L_on_abort);
 318   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 319   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 320     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 321   }
 322   if (RTMRetryCount > 0) {
 323     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 324     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 325   }
 326 
 327   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 328   testptr(tmpReg, tmpReg) ;
 329   jccb(Assembler::notZero, L_decrement_retry) ;
 330 
 331   // Appears unlocked - try to swing _owner from null to non-null.
 332   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 333 #ifdef _LP64
 334   Register threadReg = r15_thread;
 335 #else
 336   get_thread(scrReg);
 337   Register threadReg = scrReg;
 338 #endif
 339   lock();
 340   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 341 
 342   if (RTMRetryCount > 0) {
 343     // success done else retry
 344     jccb(Assembler::equal, DONE_LABEL) ;
 345     bind(L_decrement_retry);
 346     // Spin and retry if lock is busy.
 347     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 348   }
 349   else {
 350     bind(L_decrement_retry);
 351   }
 352 }
 353 
 354 #endif //  INCLUDE_RTM_OPT
 355 
 356 // fast_lock and fast_unlock used by C2
 357 
 358 // Because the transitions from emitted code to the runtime
 359 // monitorenter/exit helper stubs are so slow it's critical that
 360 // we inline both the stack-locking fast path and the inflated fast path.
 361 //
 362 // See also: cmpFastLock and cmpFastUnlock.
 363 //
 364 // What follows is a specialized inline transliteration of the code
 365 // in enter() and exit(). If we're concerned about I$ bloat another
 366 // option would be to emit TrySlowEnter and TrySlowExit methods
 367 // at startup-time.  These methods would accept arguments as
 368 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 369 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 370 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 371 // In practice, however, the # of lock sites is bounded and is usually small.
 372 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 373 // if the processor uses simple bimodal branch predictors keyed by EIP,
 374 // since the helper routines would be called from multiple synchronization
 375 // sites.
 376 //
 377 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 378 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 379 // to those specialized methods.  That'd give us a mostly platform-independent
 380 // implementation that the JITs could optimize and inline at their pleasure.
 381 // Done correctly, the only time we'd need to cross to native would be
 382 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 383 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 384 // (b) explicit barriers or fence operations.
 385 //
 386 // TODO:
 387 //
 388 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 389 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 390 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 391 //    the lock operators would typically be faster than reifying Self.
 392 //
 393 // *  Ideally I'd define the primitives as:
 394 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 395 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 396 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 397 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 398 //    Furthermore the register assignments are overconstrained, possibly resulting in
 399 //    sub-optimal code near the synchronization site.
 400 //
 401 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 402 //    Alternately, use a better sp-proximity test.
 403 //
 404 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 405 //    Either one is sufficient to uniquely identify a thread.
 406 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 407 //
 408 // *  Intrinsify notify() and notifyAll() for the common cases where the
 409 //    object is locked by the calling thread but the waitlist is empty.
 410 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 411 //
 412 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 413 //    But beware of excessive branch density on AMD Opterons.
 414 //
 415 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 416 //    or failure of the fast path.  If the fast path fails then we pass
 417 //    control to the slow path, typically in C.  In fast_lock and
 418 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 419 //    will emit a conditional branch immediately after the node.
 420 //    So we have branches to branches and lots of ICC.ZF games.
 421 //    Instead, it might be better to have C2 pass a "FailureLabel"
 422 //    into fast_lock and fast_unlock.  In the case of success, control
 423 //    will drop through the node.  ICC.ZF is undefined at exit.
 424 //    In the case of failure, the node will branch directly to the
 425 //    FailureLabel
 426 
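     // Rough outline of fast_lock below (ZF reports success or failure):
     //   1. Optional biased-locking enter (may branch straight to DONE_LABEL).
     //   2. Optional RTM stack-locking attempt.
     //   3. Load the markword; if its monitor bit is set, take the inflated path.
     //   4. Otherwise try to CAS a stack-lock (the on-stack box address) into the
     //      markword; if that fails, check for a recursive stack-lock with the
     //      sp-proximity test.
     //   5. Inflated path: RTM inflated locking when enabled, otherwise try to swing
     //      the ObjectMonitor's _owner field from NULL to a value identifying this thread.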
 427 
 428 // obj: object to lock
 429 // box: on-stack box address (displaced header location) - KILLED
 430 // rax,: tmp -- KILLED
 431 // scr: tmp -- KILLED
 432 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 433                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 434                                  BiasedLockingCounters* counters,
 435                                  RTMLockingCounters* rtm_counters,
 436                                  RTMLockingCounters* stack_rtm_counters,
 437                                  Metadata* method_data,
 438                                  bool use_rtm, bool profile_rtm) {
 439   // Ensure the register assignments are disjoint
 440   assert(tmpReg == rax, "");
 441 
 442   if (use_rtm) {
 443     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 444   } else {
 445     assert(cx1Reg == noreg, "");
 446     assert(cx2Reg == noreg, "");
 447     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 448   }
 449 
 450   if (counters != NULL) {
 451     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 452   }
 453 
 454   // Possible cases that we'll encounter in fast_lock
 455   // ------------------------------------------------
 456   // * Inflated
 457   //    -- unlocked
 458   //    -- Locked
 459   //       = by self
 460   //       = by other
 461   // * biased
 462   //    -- by Self
 463   //    -- by other
 464   // * neutral
 465   // * stack-locked
 466   //    -- by self
 467   //       = sp-proximity test hits
 468   //       = sp-proximity test generates false-negative
 469   //    -- by other
 470   //
 471 
 472   Label IsInflated, DONE_LABEL;
 473 
 474   // it's stack-locked, biased or neutral
 475   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 476   // order to reduce the number of conditional branches in the most common cases.
 477   // Beware -- there's a subtle invariant that fetch of the markword
 478   // at [FETCH], below, will never observe a biased encoding (*101b).
 479   // If this invariant is not held we risk exclusion (safety) failure.
 480   if (UseBiasedLocking && !UseOptoBiasInlining) {
 481     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
 482   }
 483 
 484 #if INCLUDE_RTM_OPT
 485   if (UseRTMForStackLocks && use_rtm) {
 486     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 487                       stack_rtm_counters, method_data, profile_rtm,
 488                       DONE_LABEL, IsInflated);
 489   }
 490 #endif // INCLUDE_RTM_OPT
 491 
 492   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 493   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 494   jccb(Assembler::notZero, IsInflated);
 495 
 496   // Attempt stack-locking ...
 497   orptr (tmpReg, markWord::unlocked_value);
 498   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 499   lock();
 500   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 501   if (counters != NULL) {
 502     cond_inc32(Assembler::equal,
 503                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 504   }
 505   jcc(Assembler::equal, DONE_LABEL);           // Success
 506 
 507   // Recursive locking.
 508   // The object is stack-locked: markword contains stack pointer to BasicLock.
 509   // Locked by current thread if difference with current SP is less than one page.
 510   subptr(tmpReg, rsp);
 511   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 512   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
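       // The mask above ignores the in-page offset bits, so the AND is zero exactly when
       // the difference computed above is non-negative and smaller than one page (with the
       // low lock bits also clear), i.e. the markword points into the current stack page.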
 513   movptr(Address(boxReg, 0), tmpReg);
 514   if (counters != NULL) {
 515     cond_inc32(Assembler::equal,
 516                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 517   }
 518   jmp(DONE_LABEL);
 519 
 520   bind(IsInflated);
 521   // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value
 522 
 523 #if INCLUDE_RTM_OPT
 524   // Use the same RTM locking code in 32- and 64-bit VM.
 525   if (use_rtm) {
 526     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 527                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 528   } else {
 529 #endif // INCLUDE_RTM_OPT
 530 
 531 #ifndef _LP64
 532   // The object is inflated.
 533 
 534   // boxReg refers to the on-stack BasicLock in the current frame.
 535   // We'd like to write:
 536   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 537   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 538   // additional latency as we have another ST in the store buffer that must drain.
 539 
 540   // avoid ST-before-CAS
 541   // register juggle because we need tmpReg for cmpxchgptr below
 542   movptr(scrReg, boxReg);
 543   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 544 
 545   // Optimistic form: consider XORL tmpReg,tmpReg
 546   movptr(tmpReg, NULL_WORD);
 547 
 548   // Appears unlocked - try to swing _owner from null to non-null.
 549   // Ideally, I'd manifest "Self" with get_thread and then attempt
 550   // to CAS the register containing Self into m->Owner.
 551   // But we don't have enough registers, so instead we can either try to CAS
 552   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 553   // we later store "Self" into m->Owner.  Transiently storing a stack address
 554   // (rsp or the address of the box) into  m->owner is harmless.
 555   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 556   lock();
 557   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 558   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 559   // If we weren't able to swing _owner from NULL to the BasicLock
 560   // then take the slow path.
 561   jccb  (Assembler::notZero, DONE_LABEL);
 562   // update _owner from BasicLock to thread
 563   get_thread (scrReg);                    // beware: clobbers ICCs
 564   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 565   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 566 
 567   // If the CAS fails we can either retry or pass control to the slow path.
 568   // We use the latter tactic.
 569   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 570   // If the CAS was successful ...
 571   //   Self has acquired the lock
 572   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 573   // Intentional fall-through into DONE_LABEL ...
 574 #else // _LP64
 575   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 576   movq(scrReg, tmpReg);
 577   xorq(tmpReg, tmpReg);
 578   lock();
 579   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 580   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 581   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 582   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 583   // Intentional fall-through into DONE_LABEL ...
 584   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 585 #endif // _LP64
 586 #if INCLUDE_RTM_OPT
 587   } // use_rtm()
 588 #endif
 589   // DONE_LABEL is a hot target - we'd really like to place it at the
 590   // start of cache line by padding with NOPs.
 591   // See the AMD and Intel software optimization manuals for the
 592   // most efficient "long" NOP encodings.
 593   // Unfortunately none of our alignment mechanisms suffice.
 594   bind(DONE_LABEL);
 595 
 596   // At DONE_LABEL the icc ZFlag is set as follows ...
 597   // fast_unlock uses the same protocol.
 598   // ZFlag == 1 -> Success
 599   // ZFlag == 0 -> Failure - force control through the slow path
 600 }
 601 
 602 // obj: object to unlock
 603 // box: box address (displaced header location), killed.  Must be EAX.
 604 // tmp: killed, cannot be obj nor box.
 605 //
 606 // Some commentary on balanced locking:
 607 //
 608 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 609 // Methods that don't have provably balanced locking are forced to run in the
 610 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 611 // The interpreter provides two properties:
 612 // I1:  At return-time the interpreter automatically and quietly unlocks any
 613 //      objects acquired by the current activation (frame).  Recall that the
 614 //      interpreter maintains an on-stack list of locks currently held by
 615 //      a frame.
 616 // I2:  If a method attempts to unlock an object that is not held by
 617 //      the frame, the interpreter throws IMSX.
 618 //
 619 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 620 // B() doesn't have provably balanced locking so it runs in the interpreter.
 621 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 622 // is still locked by A().
 623 //
 624 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 625 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 626 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 627 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 628 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 629 // could reasonably *avoid* checking owner in fast_unlock().
 630 // In the interest of performance we elide m->Owner==Self check in unlock.
 631 // A perfectly viable alternative is to elide the owner check except when
 632 // Xcheck:jni is enabled.
 633 
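     // Rough outline of fast_unlock below (ZF reports success, as in fast_lock):
     //   1. Optional biased-locking exit.
     //   2. Optional RTM stack-lock unlock (xend) if the markword is still unlocked.
     //   3. A zero displaced header in the box means a recursive stack-lock: done.
     //   4. Inflated (monitor bit set): optional RTM xend if the monitor has no owner,
     //      otherwise the 1-0 exit protocol with a successor check.
     //   5. Stack-locked: CAS the displaced header back into the object's markword.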
 634 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 635   assert(boxReg == rax, "");
 636   assert_different_registers(objReg, boxReg, tmpReg);
 637 
 638   Label DONE_LABEL, Stacked, CheckSucc;
 639 
 640   // Critically, the biased locking test must have precedence over
 641   // and appear before the (box->dhw == 0) recursive stack-lock test.
 642   if (UseBiasedLocking && !UseOptoBiasInlining) {
 643     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 644   }
 645 
 646 #if INCLUDE_RTM_OPT
 647   if (UseRTMForStackLocks && use_rtm) {
 648     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 649     Label L_regular_unlock;
 650     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 651     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 652     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 653     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 654     xend();                                                           // otherwise end...
 655     jmp(DONE_LABEL);                                                  // ... and we're done
 656     bind(L_regular_unlock);
 657   }
 658 #endif
 659 
 660   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 661   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 662   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 663   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 664   jccb  (Assembler::zero, Stacked);
 665 
 666   // It's inflated.
 667 #if INCLUDE_RTM_OPT
 668   if (use_rtm) {
 669     Label L_regular_inflated_unlock;
 670     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 671     movptr(boxReg, Address(tmpReg, owner_offset));
 672     testptr(boxReg, boxReg);
 673     jccb(Assembler::notZero, L_regular_inflated_unlock);
 674     xend();
 675     jmpb(DONE_LABEL);
 676     bind(L_regular_inflated_unlock);
 677   }
 678 #endif
 679 
 680   // Despite our balanced locking property we still check that m->_owner == Self
 681   // as java routines or native JNI code called by this thread might
 682   // have released the lock.
 683   // Refer to the comments in synchronizer.cpp for how we might encode extra
 684   // state in _succ so we can avoid fetching EntryList|cxq.
 685   //
 686   // I'd like to add more cases in fast_lock() and fast_unlock() --
 687   // such as recursive enter and exit -- but we have to be wary of
 688   // I$ bloat, T$ effects and BP$ effects.
 689   //
 690   // If there's no contention try a 1-0 exit.  That is, exit without
 691   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 692   // we detect and recover from the race that the 1-0 exit admits.
 693   //
 694   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 695   // before it STs null into _owner, releasing the lock.  Updates
 696   // to data protected by the critical section must be visible before
 697   // we drop the lock (and thus before any other thread could acquire
 698   // the lock and observe the fields protected by the lock).
 699   // IA32's memory-model is SPO, so STs are ordered with respect to
 700   // each other and there's no need for an explicit barrier (fence).
 701   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 702 #ifndef _LP64
 703   get_thread (boxReg);
 704 
 705   // Note that we could employ various encoding schemes to reduce
 706   // the number of loads below (currently 4) to just 2 or 3.
 707   // Refer to the comments in synchronizer.cpp.
 708   // In practice the chain of fetches doesn't seem to impact performance, however.
 709   xorptr(boxReg, boxReg);
 710   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 711   jccb  (Assembler::notZero, DONE_LABEL);
 712   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 713   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 714   jccb  (Assembler::notZero, CheckSucc);
 715   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 716   jmpb  (DONE_LABEL);
 717 
 718   bind (Stacked);
 719   // It's not inflated and it's not recursively stack-locked and it's not biased.
 720   // It must be stack-locked.
 721   // Try to reset the header to displaced header.
 722   // The "box" value on the stack is stable, so we can reload
 723   // and be assured we observe the same value as above.
 724   movptr(tmpReg, Address(boxReg, 0));
 725   lock();
 726   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 727   // Intentional fall-through into DONE_LABEL
 728 
 729   // DONE_LABEL is a hot target - we'd really like to place it at the
 730   // start of cache line by padding with NOPs.
 731   // See the AMD and Intel software optimization manuals for the
 732   // most efficient "long" NOP encodings.
 733   // Unfortunately none of our alignment mechanisms suffice.
 734   bind (CheckSucc);
 735 #else // _LP64
 736   // It's inflated
 737   xorptr(boxReg, boxReg);
 738   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 739   jccb  (Assembler::notZero, DONE_LABEL);
 740   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 741   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 742   jccb  (Assembler::notZero, CheckSucc);
 743   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 744   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 745   jmpb  (DONE_LABEL);
 746 
 747   // Try to avoid passing control into the slow_path ...
 748   Label LSuccess, LGoSlowPath ;
 749   bind  (CheckSucc);
 750 
 751   // The following optional optimization can be elided if necessary
 752   // Effectively: if (succ == null) goto slow path
 753   // The code reduces the window for a race, however,
 754   // and thus benefits performance.
 755   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 756   jccb  (Assembler::zero, LGoSlowPath);
 757 
 758   xorptr(boxReg, boxReg);
 759   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 760   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 761 
 762   // Memory barrier/fence
 763   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 764   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 765   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 766   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 767   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 768   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 769   lock(); addl(Address(rsp, 0), 0);
 770 
 771   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 772   jccb  (Assembler::notZero, LSuccess);
 773 
 774   // Rare inopportune interleaving - race.
 775   // The successor vanished in the small window above.
 776   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 777   // We need to ensure progress and succession.
 778   // Try to reacquire the lock.
 779   // If that fails then the new owner is responsible for succession and this
 780   // thread needs to take no further action and can exit via the fast path (success).
 781   // If the re-acquire succeeds then pass control into the slow path.
 782   // As implemented, this latter mode is horrible because we generated more
 783   // coherence traffic on the lock *and* artificially extended the critical section
 784   // length by virtue of passing control into the slow path.
 785 
 786   // box is really RAX -- the following CMPXCHG depends on that binding
 787   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 788   lock();
 789   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 790   // There's no successor so we tried to regrab the lock.
 791   // If that didn't work, then another thread grabbed the
 792   // lock so we're done (and exit was a success).
 793   jccb  (Assembler::notEqual, LSuccess);
 794   // Intentional fall-through into slow path
 795 
 796   bind  (LGoSlowPath);
 797   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 798   jmpb  (DONE_LABEL);
 799 
 800   bind  (LSuccess);
 801   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 802   jmpb  (DONE_LABEL);
 803 
 804   bind  (Stacked);
 805   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 806   lock();
 807   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 808 
 809 #endif
 810   bind(DONE_LABEL);
 811 }
 812 
 813 //-------------------------------------------------------------------------------------------
 814 // Generic instructions support for use in .ad files C2 code generation
 815 
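     // Vector abs/neg for float and double lanes: abs clears the IEEE sign bit by ANDing
     // with a sign mask, neg flips the sign bit by XORing with a sign-flip constant.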
 816 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 817   if (dst != src) {
 818     movdqu(dst, src);
 819   }
 820   if (opcode == Op_AbsVD) {
 821     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 822   } else {
 823     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 824     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 825   }
 826 }
 827 
 828 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 829   if (opcode == Op_AbsVD) {
 830     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 831   } else {
 832     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 833     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 834   }
 835 }
 836 
 837 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 838   if (dst != src) {
 839     movdqu(dst, src);
 840   }
 841   if (opcode == Op_AbsVF) {
 842     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 843   } else {
 844     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 845     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 846   }
 847 }
 848 
 849 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 850   if (opcode == Op_AbsVF) {
 851     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 852   } else {
 853     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 854     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 855   }
 856 }
 857 
 858 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
 859   if (sign) {
 860     pmovsxbw(dst, src);
 861   } else {
 862     pmovzxbw(dst, src);
 863   }
 864 }
 865 
 866 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
 867   if (sign) {
 868     vpmovsxbw(dst, src, vector_len);
 869   } else {
 870     vpmovzxbw(dst, src, vector_len);
 871   }
 872 }
 873 
 874 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
 875   if (opcode == Op_RShiftVI) {
 876     psrad(dst, src);
 877   } else if (opcode == Op_LShiftVI) {
 878     pslld(dst, src);
 879   } else {
 880     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 881     psrld(dst, src);
 882   }
 883 }
 884 
 885 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 886   if (opcode == Op_RShiftVI) {
 887     vpsrad(dst, nds, src, vector_len);
 888   } else if (opcode == Op_LShiftVI) {
 889     vpslld(dst, nds, src, vector_len);
 890   } else {
 891     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 892     vpsrld(dst, nds, src, vector_len);
 893   }
 894 }
 895 
 896 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
 897   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 898     psraw(dst, src);
 899   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 900     psllw(dst, src);
 901   } else {
 902     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 903     psrlw(dst, src);
 904   }
 905 }
 906 
 907 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 908   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 909     vpsraw(dst, nds, src, vector_len);
 910   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 911     vpsllw(dst, nds, src, vector_len);
 912   } else {
 913     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 914     vpsrlw(dst, nds, src, vector_len);
 915   }
 916 }
 917 
 918 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
 919   if (opcode == Op_RShiftVL) {
 920     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
 921   } else if (opcode == Op_LShiftVL) {
 922     psllq(dst, src);
 923   } else {
 924     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 925     psrlq(dst, src);
 926   }
 927 }
 928 
 929 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 930   if (opcode == Op_RShiftVL) {
 931     evpsraq(dst, nds, src, vector_len);
 932   } else if (opcode == Op_LShiftVL) {
 933     vpsllq(dst, nds, src, vector_len);
 934   } else {
 935     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 936     vpsrlq(dst, nds, src, vector_len);
 937   }
 938 }
 939 
 940 // Reductions for vectors of ints, longs, floats, and doubles.
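     // General strategy: repeatedly fold the upper half of the vector into the lower half
     // (vextract*/pshufd plus the reduction op) until a single lane remains; the integer
     // and long variants then combine that lane with the scalar input src1.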
 941 
 942 void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
 943   int vector_len = Assembler::AVX_128bit;
 944 
 945   switch (opcode) {
 946     case Op_AndReductionV:  pand(dst, src); break;
 947     case Op_OrReductionV:   por (dst, src); break;
 948     case Op_XorReductionV:  pxor(dst, src); break;
 949 
 950     case Op_AddReductionVF: addss(dst, src); break;
 951     case Op_AddReductionVD: addsd(dst, src); break;
 952     case Op_AddReductionVI: paddd(dst, src); break;
 953     case Op_AddReductionVL: paddq(dst, src); break;
 954 
 955     case Op_MulReductionVF: mulss(dst, src); break;
 956     case Op_MulReductionVD: mulsd(dst, src); break;
 957     case Op_MulReductionVI: pmulld(dst, src); break;
 958     case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
 959 
 960     default: assert(false, "wrong opcode");
 961   }
 962 }
 963 
 964 void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
 965   int vector_len = Assembler::AVX_256bit;
 966 
 967   switch (opcode) {
 968     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
 969     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
 970     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
 971 
 972     case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
 973     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
 974 
 975     case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
 976     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
 977 
 978     default: assert(false, "wrong opcode");
 979   }
 980 }
 981 
 982 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
 983                                   XMMRegister dst, XMMRegister src,
 984                                   XMMRegister vtmp1, XMMRegister vtmp2) {
 985   switch (opcode) {
 986     case Op_AddReductionVF:
 987     case Op_MulReductionVF:
 988       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
 989       break;
 990 
 991     case Op_AddReductionVD:
 992     case Op_MulReductionVD:
 993       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
 994       break;
 995 
 996     default: assert(false, "wrong opcode");
 997   }
 998 }
 999 
1000 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1001                                 Register dst, Register src1, XMMRegister src2,
1002                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1003   switch (vlen) {
1004     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1005     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1006     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1007     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1008 
1009     default: assert(false, "wrong vector length");
1010   }
1011 }
1012 
1013 #ifdef _LP64
1014 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1015                                 Register dst, Register src1, XMMRegister src2,
1016                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1017   switch (vlen) {
1018     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1019     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1020     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1021 
1022     default: assert(false, "wrong vector length");
1023   }
1024 }
1025 #endif // _LP64
1026 
1027 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1028   switch (vlen) {
1029     case 2:
1030       assert(vtmp2 == xnoreg, "");
1031       reduce2F(opcode, dst, src, vtmp1);
1032       break;
1033     case 4:
1034       assert(vtmp2 == xnoreg, "");
1035       reduce4F(opcode, dst, src, vtmp1);
1036       break;
1037     case 8:
1038       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1039       break;
1040     case 16:
1041       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1042       break;
1043     default: assert(false, "wrong vector length");
1044   }
1045 }
1046 
1047 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1048   switch (vlen) {
1049     case 2:
1050       assert(vtmp2 == xnoreg, "");
1051       reduce2D(opcode, dst, src, vtmp1);
1052       break;
1053     case 4:
1054       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1055       break;
1056     case 8:
1057       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1058       break;
1059     default: assert(false, "wrong vector length");
1060   }
1061 }
1062 
1063 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1064   if (opcode == Op_AddReductionVI) {
1065     if (vtmp1 != src2) {
1066       movdqu(vtmp1, src2);
1067     }
1068     phaddd(vtmp1, vtmp1);
1069   } else {
1070     pshufd(vtmp1, src2, 0x1);
1071     reduce_operation_128(opcode, vtmp1, src2);
1072   }
1073   movdl(vtmp2, src1);
1074   reduce_operation_128(opcode, vtmp1, vtmp2);
1075   movdl(dst, vtmp1);
1076 }
1077 
1078 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1079   if (opcode == Op_AddReductionVI) {
1080     if (vtmp1 != src2) {
1081       movdqu(vtmp1, src2);
1082     }
1083     phaddd(vtmp1, src2);
1084     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1085   } else {
1086     pshufd(vtmp2, src2, 0xE);
1087     reduce_operation_128(opcode, vtmp2, src2);
1088     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1089   }
1090 }
1091 
1092 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1093   if (opcode == Op_AddReductionVI) {
1094     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1095     vextracti128_high(vtmp2, vtmp1);
1096     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1097     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1098   } else {
1099     vextracti128_high(vtmp1, src2);
1100     reduce_operation_128(opcode, vtmp1, src2);
1101     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1102   }
1103 }
1104 
1105 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1106   vextracti64x4_high(vtmp2, src2);
1107   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1108   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1109 }
1110 
1111 #ifdef _LP64
1112 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1113   pshufd(vtmp2, src2, 0xE);
1114   reduce_operation_128(opcode, vtmp2, src2);
1115   movdq(vtmp1, src1);
1116   reduce_operation_128(opcode, vtmp1, vtmp2);
1117   movdq(dst, vtmp1);
1118 }
1119 
1120 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1121   vextracti128_high(vtmp1, src2);
1122   reduce_operation_128(opcode, vtmp1, src2);
1123   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1124 }
1125 
1126 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1127   vextracti64x4_high(vtmp2, src2);
1128   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1129   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1130 }
1131 #endif // _LP64
1132 
1133 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1134   reduce_operation_128(opcode, dst, src);
1135   pshufd(vtmp, src, 0x1);
1136   reduce_operation_128(opcode, dst, vtmp);
1137 }
1138 
1139 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1140   reduce2F(opcode, dst, src, vtmp);
1141   pshufd(vtmp, src, 0x2);
1142   reduce_operation_128(opcode, dst, vtmp);
1143   pshufd(vtmp, src, 0x3);
1144   reduce_operation_128(opcode, dst, vtmp);
1145 }
1146 
1147 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1148   reduce4F(opcode, dst, src, vtmp2);
1149   vextractf128_high(vtmp2, src);
1150   reduce4F(opcode, dst, vtmp2, vtmp1);
1151 }
1152 
1153 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1154   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1155   vextracti64x4_high(vtmp1, src);
1156   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1157 }
1158 
1159 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1160   reduce_operation_128(opcode, dst, src);
1161   pshufd(vtmp, src, 0xE);
1162   reduce_operation_128(opcode, dst, vtmp);
1163 }
1164 
1165 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1166   reduce2D(opcode, dst, src, vtmp2);
1167   vextractf128_high(vtmp2, src);
1168   reduce2D(opcode, dst, vtmp2, vtmp1);
1169 }
1170 
1171 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1172   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1173   vextracti64x4_high(vtmp1, src);
1174   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1175 }
1176 
1177 //-------------------------------------------------------------------------------------------
1178 
1179 // IndexOf for constant substrings with size >= 8 chars
1180 // which don't need to be loaded through the stack.
1181 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
1182                                          Register cnt1, Register cnt2,
1183                                          int int_cnt2,  Register result,
1184                                          XMMRegister vec, Register tmp,
1185                                          int ae) {
1186   ShortBranchVerifier sbv(this);
1187   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1188   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1189 
1190   // This method uses the pcmpestri instruction with bound registers
1191   //   inputs:
1192   //     xmm - substring
1193   //     rax - substring length (elements count)
1194   //     mem - scanned string
1195   //     rdx - string length (elements count)
1196   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1197   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1198   //   outputs:
1199   //     rcx - matched index in string
1200   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1201   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1202   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1203   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1204   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
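       // scale1 is the element size of the scanned string, scale2 that of the substring;
       // they differ only for UL, where the Latin-1 substring is zero-extended to shorts.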
1205 
1206   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
1207         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
1208         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
1209 
1210   // Note, inline_string_indexOf() generates checks:
1211   // if (substr.count > string.count) return -1;
1212   // if (substr.count == 0) return 0;
1213   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
1214 
1215   // Load substring.
1216   if (ae == StrIntrinsicNode::UL) {
1217     pmovzxbw(vec, Address(str2, 0));
1218   } else {
1219     movdqu(vec, Address(str2, 0));
1220   }
1221   movl(cnt2, int_cnt2);
1222   movptr(result, str1); // string addr
1223 
1224   if (int_cnt2 > stride) {
1225     jmpb(SCAN_TO_SUBSTR);
1226 
1227     // Reload substr for rescan, this code
1228     // is executed only for large substrings (> 8 chars)
1229     bind(RELOAD_SUBSTR);
1230     if (ae == StrIntrinsicNode::UL) {
1231       pmovzxbw(vec, Address(str2, 0));
1232     } else {
1233       movdqu(vec, Address(str2, 0));
1234     }
1235     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
1236 
1237     bind(RELOAD_STR);
1238     // We came here after the beginning of the substring was
1239     // matched but the rest of it was not, so we need to search
1240     // again. Start from the next element after the previous match.
1241 
1242     // cnt2 is the number of remaining substring elements and
1243     // cnt1 is the number of remaining string elements when cmp failed.
1244     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
1245     subl(cnt1, cnt2);
1246     addl(cnt1, int_cnt2);
1247     movl(cnt2, int_cnt2); // Now restore cnt2
1248 
1249     decrementl(cnt1);     // Shift to next element
1250     cmpl(cnt1, cnt2);
1251     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1252 
1253     addptr(result, (1<<scale1));
1254 
1255   } // (int_cnt2 > 8)
1256 
1257   // Scan string for start of substr in 16-byte vectors
1258   bind(SCAN_TO_SUBSTR);
1259   pcmpestri(vec, Address(result, 0), mode);
1260   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1261   subl(cnt1, stride);
1262   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1263   cmpl(cnt1, cnt2);
1264   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1265   addptr(result, 16);
1266   jmpb(SCAN_TO_SUBSTR);
1267 
1268   // Found a potential substr
1269   bind(FOUND_CANDIDATE);
1270   // Matched whole vector if first element matched (tmp(rcx) == 0).
1271   if (int_cnt2 == stride) {
1272     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
1273   } else { // int_cnt2 > 8
1274     jccb(Assembler::overflow, FOUND_SUBSTR);
1275   }
1276   // After pcmpestri tmp(rcx) contains matched element index
1277   // Compute start addr of substr
1278   lea(result, Address(result, tmp, scale1));
1279 
1280   // Make sure string is still long enough
1281   subl(cnt1, tmp);
1282   cmpl(cnt1, cnt2);
1283   if (int_cnt2 == stride) {
1284     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1285   } else { // int_cnt2 > 8
1286     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
1287   }
1288   // Left less than substring.
1289 
1290   bind(RET_NOT_FOUND);
1291   movl(result, -1);
1292   jmp(EXIT);
1293 
1294   if (int_cnt2 > stride) {
1295     // This code is optimized for the case when whole substring
1296     // is matched if its head is matched.
1297     bind(MATCH_SUBSTR_HEAD);
1298     pcmpestri(vec, Address(result, 0), mode);
1299     // Reload only the string if it does not match
1300     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
1301 
1302     Label CONT_SCAN_SUBSTR;
1303     // Compare the rest of substring (> 8 chars).
1304     bind(FOUND_SUBSTR);
1305     // First 8 chars are already matched.
1306     negptr(cnt2);
1307     addptr(cnt2, stride);
1308 
1309     bind(SCAN_SUBSTR);
1310     subl(cnt1, stride);
1311     cmpl(cnt2, -stride); // Do not read beyond substring
1312     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
1313     // Back-up strings to avoid reading beyond substring:
1314     // cnt1 = cnt1 - cnt2 + 8
1315     addl(cnt1, cnt2); // cnt2 is negative
1316     addl(cnt1, stride);
1317     movl(cnt2, stride); negptr(cnt2);
1318     bind(CONT_SCAN_SUBSTR);
1319     if (int_cnt2 < (int)G) {
1320       int tail_off1 = int_cnt2<<scale1;
1321       int tail_off2 = int_cnt2<<scale2;
1322       if (ae == StrIntrinsicNode::UL) {
1323         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
1324       } else {
1325         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
1326       }
1327       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
1328     } else {
1329       // calculate index in register to avoid integer overflow (int_cnt2*2)
1330       movl(tmp, int_cnt2);
1331       addptr(tmp, cnt2);
1332       if (ae == StrIntrinsicNode::UL) {
1333         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
1334       } else {
1335         movdqu(vec, Address(str2, tmp, scale2, 0));
1336       }
1337       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
1338     }
1339     // Need to reload string pointers if the whole vector did not match
1340     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1341     addptr(cnt2, stride);
1342     jcc(Assembler::negative, SCAN_SUBSTR);
1343     // Fall through if found full substring
1344 
1345   } // (int_cnt2 > 8)
1346 
1347   bind(RET_FOUND);
1348   // Found result if we matched full small substring.
1349   // Compute substr offset
1350   subptr(result, str1);
1351   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1352     shrl(result, 1); // index
1353   }
1354   bind(EXIT);
1355 
1356 } // string_indexofC8
1357 
1358 // Small strings are loaded through the stack if they cross a page boundary.
1359 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
1360                                        Register cnt1, Register cnt2,
1361                                        int int_cnt2,  Register result,
1362                                        XMMRegister vec, Register tmp,
1363                                        int ae) {
1364   ShortBranchVerifier sbv(this);
1365   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1366   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1367 
1368   //
1369   // int_cnt2 is the length of a small (< 8 chars) constant substring
1370   // or (-1) for a non constant substring, in which case its length
1371   // is in the cnt2 register.
1372   //
1373   // Note, inline_string_indexOf() generates checks:
1374   // if (substr.count > string.count) return -1;
1375   // if (substr.count == 0) return 0;
1376   //
1377   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1378   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
1379   // This method uses the pcmpestri instruction with bound registers
1380   //   inputs:
1381   //     xmm - substring
1382   //     rax - substring length (elements count)
1383   //     mem - scanned string
1384   //     rdx - string length (elements count)
1385   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1386   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1387   //   outputs:
1388   //     rcx - matched index in string
1389   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1390   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1391   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1392   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
1393 
1394   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
1395         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
1396         FOUND_CANDIDATE;
1397 
1398   { //========================================================
1399     // We don't know where these strings are located
1400     // and we can't read beyond them. Load them through the stack.
1401     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
1402 
1403     movptr(tmp, rsp); // save old SP
1404 
1405     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
1406       if (int_cnt2 == (1>>scale2)) { // One byte
1407         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
1408         load_unsigned_byte(result, Address(str2, 0));
1409         movdl(vec, result); // move 32 bits
1410       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
1411         // Not enough header space in 32-bit VM: 12+3 = 15.
1412         movl(result, Address(str2, -1));
1413         shrl(result, 8);
1414         movdl(vec, result); // move 32 bits
1415       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
1416         load_unsigned_short(result, Address(str2, 0));
1417         movdl(vec, result); // move 32 bits
1418       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
1419         movdl(vec, Address(str2, 0)); // move 32 bits
1420       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
1421         movq(vec, Address(str2, 0));  // move 64 bits
1422       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
1423         // Array header size is 12 bytes in 32-bit VM
1424         // + 6 bytes for 3 chars == 18 bytes,
1425         // enough space to load vec and shift.
1426         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
1427         if (ae == StrIntrinsicNode::UL) {
1428           int tail_off = int_cnt2-8;
1429           pmovzxbw(vec, Address(str2, tail_off));
1430           psrldq(vec, -2*tail_off);
1431         }
1432         else {
1433           int tail_off = int_cnt2*(1<<scale2);
1434           movdqu(vec, Address(str2, tail_off-16));
1435           psrldq(vec, 16-tail_off);
1436         }
1437       }
1438     } else { // not constant substring
1439       cmpl(cnt2, stride);
1440       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
1441 
1442       // We can read beyond the string if str+16 does not cross a page boundary
1443       // since heaps are aligned and mapped by pages.
1444       assert(os::vm_page_size() < (int)G, "default page should be small");
1445       movl(result, str2); // We need only low 32 bits
1446       andl(result, (os::vm_page_size()-1));
1447       cmpl(result, (os::vm_page_size()-16));
1448       jccb(Assembler::belowEqual, CHECK_STR);
1449 
1450       // Move small strings to the stack to allow loading 16 bytes into vec.
1451       subptr(rsp, 16);
1452       int stk_offset = wordSize-(1<<scale2);
1453       push(cnt2);
1454 
1455       bind(COPY_SUBSTR);
1456       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
1457         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
1458         movb(Address(rsp, cnt2, scale2, stk_offset), result);
1459       } else if (ae == StrIntrinsicNode::UU) {
1460         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
1461         movw(Address(rsp, cnt2, scale2, stk_offset), result);
1462       }
1463       decrement(cnt2);
1464       jccb(Assembler::notZero, COPY_SUBSTR);
1465 
1466       pop(cnt2);
1467       movptr(str2, rsp);  // New substring address
1468     } // non constant
1469 
1470     bind(CHECK_STR);
1471     cmpl(cnt1, stride);
1472     jccb(Assembler::aboveEqual, BIG_STRINGS);
1473 
1474     // Check for crossing a page boundary.
1475     movl(result, str1); // We need only low 32 bits
1476     andl(result, (os::vm_page_size()-1));
1477     cmpl(result, (os::vm_page_size()-16));
1478     jccb(Assembler::belowEqual, BIG_STRINGS);
1479 
1480     subptr(rsp, 16);
1481     int stk_offset = -(1<<scale1);
1482     if (int_cnt2 < 0) { // not constant
1483       push(cnt2);
1484       stk_offset += wordSize;
1485     }
1486     movl(cnt2, cnt1);
1487 
1488     bind(COPY_STR);
1489     if (ae == StrIntrinsicNode::LL) {
1490       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
1491       movb(Address(rsp, cnt2, scale1, stk_offset), result);
1492     } else {
1493       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
1494       movw(Address(rsp, cnt2, scale1, stk_offset), result);
1495     }
1496     decrement(cnt2);
1497     jccb(Assembler::notZero, COPY_STR);
1498 
1499     if (int_cnt2 < 0) { // not constant
1500       pop(cnt2);
1501     }
1502     movptr(str1, rsp);  // New string address
1503 
1504     bind(BIG_STRINGS);
1505     // Load substring.
1506     if (int_cnt2 < 0) { // -1
1507       if (ae == StrIntrinsicNode::UL) {
1508         pmovzxbw(vec, Address(str2, 0));
1509       } else {
1510         movdqu(vec, Address(str2, 0));
1511       }
1512       push(cnt2);       // substr count
1513       push(str2);       // substr addr
1514       push(str1);       // string addr
1515     } else {
1516       // Small (< 8 chars) constant substrings are loaded already.
1517       movl(cnt2, int_cnt2);
1518     }
1519     push(tmp);  // original SP
1520 
1521   } // Finished loading
1522 
1523   //========================================================
1524   // Start search
1525   //
1526 
1527   movptr(result, str1); // string addr
1528 
1529   if (int_cnt2  < 0) {  // Only for non constant substring
1530     jmpb(SCAN_TO_SUBSTR);
1531 
1532     // SP saved at sp+0
1533     // String saved at sp+1*wordSize
1534     // Substr saved at sp+2*wordSize
1535     // Substr count saved at sp+3*wordSize
1536 
1537     // Reload substr for rescan; this code
1538     // is executed only for large substrings (> 8 chars)
1539     bind(RELOAD_SUBSTR);
1540     movptr(str2, Address(rsp, 2*wordSize));
1541     movl(cnt2, Address(rsp, 3*wordSize));
1542     if (ae == StrIntrinsicNode::UL) {
1543       pmovzxbw(vec, Address(str2, 0));
1544     } else {
1545       movdqu(vec, Address(str2, 0));
1546     }
1547     // We came here after the beginning of the substring was
1548     // matched but the rest of it was not, so we need to search
1549     // again. Start from the next element after the previous match.
1550     subptr(str1, result); // Restore counter
1551     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1552       shrl(str1, 1);
1553     }
1554     addl(cnt1, str1);
1555     decrementl(cnt1);   // Shift to next element
1556     cmpl(cnt1, cnt2);
1557     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1558 
1559     addptr(result, (1<<scale1));
1560   } // non constant
1561 
1562   // Scan string for start of substr in 16-byte vectors
1563   bind(SCAN_TO_SUBSTR);
1564   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1565   pcmpestri(vec, Address(result, 0), mode);
1566   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1567   subl(cnt1, stride);
1568   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1569   cmpl(cnt1, cnt2);
1570   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1571   addptr(result, 16);
1572 
1573   bind(ADJUST_STR);
1574   cmpl(cnt1, stride); // Do not read beyond string
1575   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1576   // Back-up string to avoid reading beyond string.
1577   lea(result, Address(result, cnt1, scale1, -16));
1578   movl(cnt1, stride);
1579   jmpb(SCAN_TO_SUBSTR);
1580 
1581   // Found a potential substr
1582   bind(FOUND_CANDIDATE);
1583   // After pcmpestri tmp(rcx) contains matched element index
1584 
1585   // Make sure string is still long enough
1586   subl(cnt1, tmp);
1587   cmpl(cnt1, cnt2);
1588   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
1589   // Left less than substring.
1590 
1591   bind(RET_NOT_FOUND);
1592   movl(result, -1);
1593   jmp(CLEANUP);
1594 
1595   bind(FOUND_SUBSTR);
1596   // Compute start addr of substr
1597   lea(result, Address(result, tmp, scale1));
1598   if (int_cnt2 > 0) { // Constant substring
1599     // Repeat search for small substring (< 8 chars)
1600     // from new point without reloading substring.
1601     // Have to check that we don't read beyond string.
1602     cmpl(tmp, stride-int_cnt2);
1603     jccb(Assembler::greater, ADJUST_STR);
1604     // Fall through if matched whole substring.
1605   } else { // non constant
1606     assert(int_cnt2 == -1, "should be != 0");
1607 
1608     addl(tmp, cnt2);
1609     // Found result if we matched whole substring.
1610     cmpl(tmp, stride);
1611     jcc(Assembler::lessEqual, RET_FOUND);
1612 
1613     // Repeat search for small substring (<= 8 chars)
1614     // from new point 'str1' without reloading substring.
1615     cmpl(cnt2, stride);
1616     // Have to check that we don't read beyond string.
1617     jccb(Assembler::lessEqual, ADJUST_STR);
1618 
1619     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
1620     // Compare the rest of substring (> 8 chars).
1621     movptr(str1, result);
1622 
1623     cmpl(tmp, cnt2);
1624     // First 8 chars are already matched.
1625     jccb(Assembler::equal, CHECK_NEXT);
1626 
1627     bind(SCAN_SUBSTR);
1628     pcmpestri(vec, Address(str1, 0), mode);
1629     // Need to reload string pointers if the whole vector did not match
1630     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1631 
1632     bind(CHECK_NEXT);
1633     subl(cnt2, stride);
1634     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
1635     addptr(str1, 16);
1636     if (ae == StrIntrinsicNode::UL) {
1637       addptr(str2, 8);
1638     } else {
1639       addptr(str2, 16);
1640     }
1641     subl(cnt1, stride);
1642     cmpl(cnt2, stride); // Do not read beyond substring
1643     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
1644     // Back-up strings to avoid reading beyond substring.
1645 
1646     if (ae == StrIntrinsicNode::UL) {
1647       lea(str2, Address(str2, cnt2, scale2, -8));
1648       lea(str1, Address(str1, cnt2, scale1, -16));
1649     } else {
1650       lea(str2, Address(str2, cnt2, scale2, -16));
1651       lea(str1, Address(str1, cnt2, scale1, -16));
1652     }
1653     subl(cnt1, cnt2);
1654     movl(cnt2, stride);
1655     addl(cnt1, stride);
1656     bind(CONT_SCAN_SUBSTR);
1657     if (ae == StrIntrinsicNode::UL) {
1658       pmovzxbw(vec, Address(str2, 0));
1659     } else {
1660       movdqu(vec, Address(str2, 0));
1661     }
1662     jmp(SCAN_SUBSTR);
1663 
1664     bind(RET_FOUND_LONG);
1665     movptr(str1, Address(rsp, wordSize));
1666   } // non constant
1667 
1668   bind(RET_FOUND);
1669   // Compute substr offset
1670   subptr(result, str1);
1671   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1672     shrl(result, 1); // index
1673   }
1674   bind(CLEANUP);
1675   pop(rsp); // restore SP
1676 
1677 } // string_indexof
1678 
1679 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
1680                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
1681   ShortBranchVerifier sbv(this);
1682   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1683 
1684   int stride = 8;
1685 
1686   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
1687         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
1688         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
1689         FOUND_SEQ_CHAR, DONE_LABEL;
1690 
1691   movptr(result, str1);
1692   if (UseAVX >= 2) {
1693     cmpl(cnt1, stride);
1694     jcc(Assembler::less, SCAN_TO_CHAR);
1695     cmpl(cnt1, 2*stride);
1696     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
1697     movdl(vec1, ch);
1698     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
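         // vec1 now holds the search char replicated across all 16 word lanes.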
1699     vpxor(vec2, vec2);
1700     movl(tmp, cnt1);
1701     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
1702     andl(cnt1,0x0000000F);  //tail count (in chars)
1703 
1704     bind(SCAN_TO_16_CHAR_LOOP);
1705     vmovdqu(vec3, Address(result, 0));
1706     vpcmpeqw(vec3, vec3, vec1, 1);
1707     vptest(vec2, vec3);
1708     jcc(Assembler::carryClear, FOUND_CHAR);
1709     addptr(result, 32);
1710     subl(tmp, 2*stride);
1711     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
1712     jmp(SCAN_TO_8_CHAR);
1713     bind(SCAN_TO_8_CHAR_INIT);
1714     movdl(vec1, ch);
1715     pshuflw(vec1, vec1, 0x00);
1716     pshufd(vec1, vec1, 0);
1717     pxor(vec2, vec2);
1718   }
1719   bind(SCAN_TO_8_CHAR);
1720   cmpl(cnt1, stride);
1721   jcc(Assembler::less, SCAN_TO_CHAR);
1722   if (UseAVX < 2) {
1723     movdl(vec1, ch);
1724     pshuflw(vec1, vec1, 0x00);
1725     pshufd(vec1, vec1, 0);
1726     pxor(vec2, vec2);
1727   }
1728   movl(tmp, cnt1);
1729   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
1730   andl(cnt1,0x00000007);  //tail count (in chars)
1731 
1732   bind(SCAN_TO_8_CHAR_LOOP);
1733   movdqu(vec3, Address(result, 0));
1734   pcmpeqw(vec3, vec1);
1735   ptest(vec2, vec3);
1736   jcc(Assembler::carryClear, FOUND_CHAR);
1737   addptr(result, 16);
1738   subl(tmp, stride);
1739   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
1740   bind(SCAN_TO_CHAR);
1741   testl(cnt1, cnt1);
1742   jcc(Assembler::zero, RET_NOT_FOUND);
1743   bind(SCAN_TO_CHAR_LOOP);
1744   load_unsigned_short(tmp, Address(result, 0));
1745   cmpl(ch, tmp);
1746   jccb(Assembler::equal, FOUND_SEQ_CHAR);
1747   addptr(result, 2);
1748   subl(cnt1, 1);
1749   jccb(Assembler::zero, RET_NOT_FOUND);
1750   jmp(SCAN_TO_CHAR_LOOP);
1751 
1752   bind(RET_NOT_FOUND);
1753   movl(result, -1);
1754   jmpb(DONE_LABEL);
1755 
1756   bind(FOUND_CHAR);
1757   if (UseAVX >= 2) {
1758     vpmovmskb(tmp, vec3);
1759   } else {
1760     pmovmskb(tmp, vec3);
1761   }
1762   bsfl(ch, tmp);
1763   addl(result, ch);
1764 
1765   bind(FOUND_SEQ_CHAR);
1766   subptr(result, str1);
1767   shrl(result, 1);
1768 
1769   bind(DONE_LABEL);
1770 } // string_indexof_char
1771 
1772 // helper function for string_compare
1773 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
1774                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
1775                                            Address::ScaleFactor scale2, Register index, int ae) {
1776   if (ae == StrIntrinsicNode::LL) {
1777     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
1778     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
1779   } else if (ae == StrIntrinsicNode::UU) {
1780     load_unsigned_short(elem1, Address(str1, index, scale, 0));
1781     load_unsigned_short(elem2, Address(str2, index, scale, 0));
1782   } else {
1783     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
1784     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
1785   }
1786 }
1787 
1788 // Compare strings, used for char[] and byte[].
1789 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1790                                        Register cnt1, Register cnt2, Register result,
1791                                        XMMRegister vec1, int ae) {
1792   ShortBranchVerifier sbv(this);
1793   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
1794   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
1795   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
1796   int stride2x2 = 0x40;
1797   Address::ScaleFactor scale = Address::no_scale;
1798   Address::ScaleFactor scale1 = Address::no_scale;
1799   Address::ScaleFactor scale2 = Address::no_scale;
1800 
1801   if (ae != StrIntrinsicNode::LL) {
1802     stride2x2 = 0x20;
1803   }
1804 
1805   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
1806     shrl(cnt2, 1);
1807   }
1808   // Compute the minimum of the string lengths and the
1809   // difference of the string lengths (stack).
1810   // Do the conditional move stuff
1811   movl(result, cnt1);
1812   subl(cnt1, cnt2);
1813   push(cnt1);
1814   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
1815 
1816   // Is the minimum length zero?
1817   testl(cnt2, cnt2);
1818   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1819   if (ae == StrIntrinsicNode::LL) {
1820     // Load first bytes
1821     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
1822     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
1823   } else if (ae == StrIntrinsicNode::UU) {
1824     // Load first characters
1825     load_unsigned_short(result, Address(str1, 0));
1826     load_unsigned_short(cnt1, Address(str2, 0));
1827   } else {
1828     load_unsigned_byte(result, Address(str1, 0));
1829     load_unsigned_short(cnt1, Address(str2, 0));
1830   }
1831   subl(result, cnt1);
1832   jcc(Assembler::notZero,  POP_LABEL);
1833 
1834   if (ae == StrIntrinsicNode::UU) {
1835     // Divide length by 2 to get number of chars
1836     shrl(cnt2, 1);
1837   }
1838   cmpl(cnt2, 1);
1839   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1840 
1841   // Check if the strings start at the same location and setup scale and stride
1842   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1843     cmpptr(str1, str2);
1844     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1845     if (ae == StrIntrinsicNode::LL) {
1846       scale = Address::times_1;
1847       stride = 16;
1848     } else {
1849       scale = Address::times_2;
1850       stride = 8;
1851     }
1852   } else {
1853     scale1 = Address::times_1;
1854     scale2 = Address::times_2;
1855     // scale not used
1856     stride = 8;
1857   }
1858 
1859   if (UseAVX >= 2 && UseSSE42Intrinsics) {
1860     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
1861     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
1862     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
1863     Label COMPARE_TAIL_LONG;
1864     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
1865 
1866     int pcmpmask = 0x19;
1867     if (ae == StrIntrinsicNode::LL) {
1868       pcmpmask &= ~0x01;
1869     }
1870 
1871     // Set up to compare 16-char (32-byte) vectors,
1872     // starting from the first character again because it has an aligned address.
1873     if (ae == StrIntrinsicNode::LL) {
1874       stride2 = 32;
1875     } else {
1876       stride2 = 16;
1877     }
1878     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1879       adr_stride = stride << scale;
1880     } else {
1881       adr_stride1 = 8;  //stride << scale1;
1882       adr_stride2 = 16; //stride << scale2;
1883     }
1884 
1885     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
1886     // rax and rdx are used by pcmpestri as element counters
1887     movl(result, cnt2);
1888     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
1889     jcc(Assembler::zero, COMPARE_TAIL_LONG);
1890 
1891     // fast path : compare first 2 8-char vectors.
1892     bind(COMPARE_16_CHARS);
1893     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1894       movdqu(vec1, Address(str1, 0));
1895     } else {
1896       pmovzxbw(vec1, Address(str1, 0));
1897     }
1898     pcmpestri(vec1, Address(str2, 0), pcmpmask);
1899     jccb(Assembler::below, COMPARE_INDEX_CHAR);
1900 
1901     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1902       movdqu(vec1, Address(str1, adr_stride));
1903       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
1904     } else {
1905       pmovzxbw(vec1, Address(str1, adr_stride1));
1906       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
1907     }
1908     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
1909     addl(cnt1, stride);
1910 
1911     // Compare the characters at index in cnt1
1912     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
1913     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
1914     subl(result, cnt2);
1915     jmp(POP_LABEL);
1916 
1917     // Setup the registers to start vector comparison loop
1918     bind(COMPARE_WIDE_VECTORS);
1919     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1920       lea(str1, Address(str1, result, scale));
1921       lea(str2, Address(str2, result, scale));
1922     } else {
1923       lea(str1, Address(str1, result, scale1));
1924       lea(str2, Address(str2, result, scale2));
1925     }
1926     subl(result, stride2);
1927     subl(cnt2, stride2);
1928     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
1929     negptr(result);
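         // result is now a negative element offset; the wide-vector loop below walks it
         // up toward zero, resuming right after the 16 chars compared above.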
1930 
1931     //  In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
1932     bind(COMPARE_WIDE_VECTORS_LOOP);
1933 
1934 #ifdef _LP64
1935     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
1936       cmpl(cnt2, stride2x2);
1937       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
1938       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
1939       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
1940 
1941       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
1942       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1943         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
1944         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
1945       } else {
1946         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
1947         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
1948       }
1949       kortestql(k7, k7);
1950       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
1951       addptr(result, stride2x2);  // update since we already compared at this addr
1952       subl(cnt2, stride2x2);      // and sub the size too
1953       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
1954 
1955       vpxor(vec1, vec1);
1956       jmpb(COMPARE_WIDE_TAIL);
1957     }//if (VM_Version::supports_avx512vlbw())
1958 #endif // _LP64
1959 
1960 
1961     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1962     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1963       vmovdqu(vec1, Address(str1, result, scale));
1964       vpxor(vec1, Address(str2, result, scale));
1965     } else {
1966       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
1967       vpxor(vec1, Address(str2, result, scale2));
1968     }
1969     vptest(vec1, vec1);
1970     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
1971     addptr(result, stride2);
1972     subl(cnt2, stride2);
1973     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
1974     // clean upper bits of YMM registers
1975     vpxor(vec1, vec1);
1976 
1977     // compare wide vectors tail
1978     bind(COMPARE_WIDE_TAIL);
1979     testptr(result, result);
1980     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1981 
1982     movl(result, stride2);
1983     movl(cnt2, result);
1984     negptr(result);
1985     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1986 
1987     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
1988     bind(VECTOR_NOT_EQUAL);
1989     // clean upper bits of YMM registers
1990     vpxor(vec1, vec1);
1991     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1992       lea(str1, Address(str1, result, scale));
1993       lea(str2, Address(str2, result, scale));
1994     } else {
1995       lea(str1, Address(str1, result, scale1));
1996       lea(str2, Address(str2, result, scale2));
1997     }
1998     jmp(COMPARE_16_CHARS);
1999 
2000     // Compare tail chars, length between 1 and 15 chars
2001     bind(COMPARE_TAIL_LONG);
2002     movl(cnt2, result);
2003     cmpl(cnt2, stride);
2004     jcc(Assembler::less, COMPARE_SMALL_STR);
2005 
2006     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2007       movdqu(vec1, Address(str1, 0));
2008     } else {
2009       pmovzxbw(vec1, Address(str1, 0));
2010     }
2011     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2012     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2013     subptr(cnt2, stride);
2014     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2015     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2016       lea(str1, Address(str1, result, scale));
2017       lea(str2, Address(str2, result, scale));
2018     } else {
2019       lea(str1, Address(str1, result, scale1));
2020       lea(str2, Address(str2, result, scale2));
2021     }
2022     negptr(cnt2);
2023     jmpb(WHILE_HEAD_LABEL);
2024 
2025     bind(COMPARE_SMALL_STR);
2026   } else if (UseSSE42Intrinsics) {
2027     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2028     int pcmpmask = 0x19;
2029     // Set up to compare 8-char (16-byte) vectors,
2030     // starting from the first character again because it has an aligned address.
2031     movl(result, cnt2);
2032     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2033     if (ae == StrIntrinsicNode::LL) {
2034       pcmpmask &= ~0x01;
2035     }
2036     jcc(Assembler::zero, COMPARE_TAIL);
2037     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2038       lea(str1, Address(str1, result, scale));
2039       lea(str2, Address(str2, result, scale));
2040     } else {
2041       lea(str1, Address(str1, result, scale1));
2042       lea(str2, Address(str2, result, scale2));
2043     }
2044     negptr(result);
2045 
2046     // pcmpestri
2047     //   inputs:
2048     //     vec1- substring
2049     //     rax - negative string length (elements count)
2050     //     mem - scanned string
2051     //     rdx - string length (elements count)
2052     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2053     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2054     //   outputs:
2055     //     rcx - first mismatched element index
2056     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2057 
2058     bind(COMPARE_WIDE_VECTORS);
2059     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2060       movdqu(vec1, Address(str1, result, scale));
2061       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2062     } else {
2063       pmovzxbw(vec1, Address(str1, result, scale1));
2064       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2065     }
2066     // After pcmpestri cnt1(rcx) contains mismatched element index
2067 
2068     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2069     addptr(result, stride);
2070     subptr(cnt2, stride);
2071     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2072 
2073     // compare wide vectors tail
2074     testptr(result, result);
2075     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2076 
2077     movl(cnt2, stride);
2078     movl(result, stride);
2079     negptr(result);
2080     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2081       movdqu(vec1, Address(str1, result, scale));
2082       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2083     } else {
2084       pmovzxbw(vec1, Address(str1, result, scale1));
2085       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2086     }
2087     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2088 
2089     // Mismatched characters in the vectors
2090     bind(VECTOR_NOT_EQUAL);
2091     addptr(cnt1, result);
2092     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2093     subl(result, cnt2);
2094     jmpb(POP_LABEL);
2095 
2096     bind(COMPARE_TAIL); // limit is zero
2097     movl(cnt2, result);
2098     // Fallthru to tail compare
2099   }
2100   // Shift str2 and str1 to the end of the arrays, negate min
2101   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2102     lea(str1, Address(str1, cnt2, scale));
2103     lea(str2, Address(str2, cnt2, scale));
2104   } else {
2105     lea(str1, Address(str1, cnt2, scale1));
2106     lea(str2, Address(str2, cnt2, scale2));
2107   }
2108   decrementl(cnt2);  // first character was compared already
2109   negptr(cnt2);
2110 
2111   // Compare the rest of the elements
2112   bind(WHILE_HEAD_LABEL);
2113   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2114   subl(result, cnt1);
2115   jccb(Assembler::notZero, POP_LABEL);
2116   increment(cnt2);
2117   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2118 
2119   // Strings are equal up to min length.  Return the length difference.
2120   bind(LENGTH_DIFF_LABEL);
2121   pop(result);
2122   if (ae == StrIntrinsicNode::UU) {
2123     // Divide diff by 2 to get number of chars
2124     sarl(result, 1);
2125   }
2126   jmpb(DONE_LABEL);
2127 
2128 #ifdef _LP64
2129   if (VM_Version::supports_avx512vlbw()) {
2130 
2131     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2132 
2133     kmovql(cnt1, k7);
2134     notq(cnt1);
2135     bsfq(cnt2, cnt1);
2136     if (ae != StrIntrinsicNode::LL) {
2137       // Divide diff by 2 to get number of chars
2138       sarl(cnt2, 1);
2139     }
2140     addq(result, cnt2);
2141     if (ae == StrIntrinsicNode::LL) {
2142       load_unsigned_byte(cnt1, Address(str2, result));
2143       load_unsigned_byte(result, Address(str1, result));
2144     } else if (ae == StrIntrinsicNode::UU) {
2145       load_unsigned_short(cnt1, Address(str2, result, scale));
2146       load_unsigned_short(result, Address(str1, result, scale));
2147     } else {
2148       load_unsigned_short(cnt1, Address(str2, result, scale2));
2149       load_unsigned_byte(result, Address(str1, result, scale1));
2150     }
2151     subl(result, cnt1);
2152     jmpb(POP_LABEL);
2153   }//if (VM_Version::supports_avx512vlbw())
2154 #endif // _LP64
2155 
2156   // Discard the stored length difference
2157   bind(POP_LABEL);
2158   pop(cnt1);
2159 
2160   // That's it
2161   bind(DONE_LABEL);
2162   if(ae == StrIntrinsicNode::UL) {
2163     negl(result);
2164   }
2165 
2166 }
2167 
2168 // Search for a non-ASCII character (negative byte value) in a byte array;
2169 // return true if one is found and false otherwise.
2170 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
2171 //   @HotSpotIntrinsicCandidate
2172 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
2173 //     for (int i = off; i < off + len; i++) {
2174 //       if (ba[i] < 0) {
2175 //         return true;
2176 //       }
2177 //     }
2178 //     return false;
2179 //   }
2180 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
2181   Register result, Register tmp1,
2182   XMMRegister vec1, XMMRegister vec2) {
2183   // rsi: byte array
2184   // rcx: len
2185   // rax: result
2186   ShortBranchVerifier sbv(this);
2187   assert_different_registers(ary1, len, result, tmp1);
2188   assert_different_registers(vec1, vec2);
2189   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
2190 
2191   // len == 0
2192   testl(len, len);
2193   jcc(Assembler::zero, FALSE_LABEL);
2194 
2195   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
2196     VM_Version::supports_avx512vlbw() &&
2197     VM_Version::supports_bmi2()) {
2198 
2199     Label test_64_loop, test_tail;
2200     Register tmp3_aliased = len;
2201 
2202     movl(tmp1, len);
2203     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
2204 
2205     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
2206     andl(len, ~(64 - 1));    // vector count (in chars)
2207     jccb(Assembler::zero, test_tail);
2208 
2209     lea(ary1, Address(ary1, len, Address::times_1));
2210     negptr(len);
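         // len is now a negative byte offset from the end of the vectorized region;
         // each iteration adds 64 until it reaches zero.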
2211 
2212     bind(test_64_loop);
2213     // Check whether these 64 byte-sized elements contain any negatives
2214     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
2215     kortestql(k2, k2);
2216     jcc(Assembler::notZero, TRUE_LABEL);
2217 
2218     addptr(len, 64);
2219     jccb(Assembler::notZero, test_64_loop);
2220 
2221 
2222     bind(test_tail);
2223     // bail out when there is nothing to be done
2224     testl(tmp1, -1);
2225     jcc(Assembler::zero, FALSE_LABEL);
2226 
2227     // ~(~0 << len) applied up to two times (for 32-bit scenario)
2228 #ifdef _LP64
2229     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
2230     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
2231     notq(tmp3_aliased);
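         // e.g. tmp1 == 5: ~(~0 << 5) == 0x1F, so k3 selects only the 5 tail bytes.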
2232     kmovql(k3, tmp3_aliased);
2233 #else
2234     Label k_init;
2235     jmp(k_init);
2236 
2237     // We cannot read 64 bits from a general-purpose register here, so we move
2238     // the data needed to compose the 64-bit mask into the instruction stream.
2239     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
2240     // compare target against the tail count held in the tmp1 register.
2241     // The result is a k register with tmp1 consecutive 1 bits, counting from
2242     // the least significant bit.
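         // Example: tmp1 == 5 is broadcast to every lane; 5 > {0,1,2,3,4} only, so k3
         // ends up with its five low bits set.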
2243     address tmp = pc();
2244     emit_int64(0x0706050403020100);
2245     emit_int64(0x0F0E0D0C0B0A0908);
2246     emit_int64(0x1716151413121110);
2247     emit_int64(0x1F1E1D1C1B1A1918);
2248     emit_int64(0x2726252423222120);
2249     emit_int64(0x2F2E2D2C2B2A2928);
2250     emit_int64(0x3736353433323130);
2251     emit_int64(0x3F3E3D3C3B3A3938);
2252 
2253     bind(k_init);
2254     lea(len, InternalAddress(tmp));
2255     // create mask to test for negative byte inside a vector
2256     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
2257     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
2258 
2259 #endif
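         // Masked compare of the tail: only the lanes selected by k3 participate, and
         // ktestq reports whether any selected lane holds a negative byte.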
2260     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
2261     ktestq(k2, k3);
2262     jcc(Assembler::notZero, TRUE_LABEL);
2263 
2264     jmp(FALSE_LABEL);
2265   } else {
2266     movl(result, len); // copy
2267 
2268     if (UseAVX >= 2 && UseSSE >= 2) {
2269       // With AVX2, use 32-byte vector compare
2270       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2271 
2272       // Compare 32-byte vectors
2273       andl(result, 0x0000001f);  //   tail count (in bytes)
2274       andl(len, 0xffffffe0);   // vector count (in bytes)
2275       jccb(Assembler::zero, COMPARE_TAIL);
2276 
2277       lea(ary1, Address(ary1, len, Address::times_1));
2278       negptr(len);
2279 
2280       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
2281       movdl(vec2, tmp1);
2282       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
2283 
2284       bind(COMPARE_WIDE_VECTORS);
2285       vmovdqu(vec1, Address(ary1, len, Address::times_1));
2286       vptest(vec1, vec2);
2287       jccb(Assembler::notZero, TRUE_LABEL);
2288       addptr(len, 32);
2289       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2290 
2291       testl(result, result);
2292       jccb(Assembler::zero, FALSE_LABEL);
2293 
2294       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2295       vptest(vec1, vec2);
2296       jccb(Assembler::notZero, TRUE_LABEL);
2297       jmpb(FALSE_LABEL);
2298 
2299       bind(COMPARE_TAIL); // len is zero
2300       movl(len, result);
2301       // Fallthru to tail compare
2302     } else if (UseSSE42Intrinsics) {
2303       // With SSE4.2, use double quad vector compare
2304       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2305 
2306       // Compare 16-byte vectors
2307       andl(result, 0x0000000f);  //   tail count (in bytes)
2308       andl(len, 0xfffffff0);   // vector count (in bytes)
2309       jcc(Assembler::zero, COMPARE_TAIL);
2310 
2311       lea(ary1, Address(ary1, len, Address::times_1));
2312       negptr(len);
2313 
2314       movl(tmp1, 0x80808080);
2315       movdl(vec2, tmp1);
2316       pshufd(vec2, vec2, 0);
2317 
2318       bind(COMPARE_WIDE_VECTORS);
2319       movdqu(vec1, Address(ary1, len, Address::times_1));
2320       ptest(vec1, vec2);
2321       jcc(Assembler::notZero, TRUE_LABEL);
2322       addptr(len, 16);
2323       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2324 
2325       testl(result, result);
2326       jcc(Assembler::zero, FALSE_LABEL);
2327 
2328       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2329       ptest(vec1, vec2);
2330       jccb(Assembler::notZero, TRUE_LABEL);
2331       jmpb(FALSE_LABEL);
2332 
2333       bind(COMPARE_TAIL); // len is zero
2334       movl(len, result);
2335       // Fallthru to tail compare
2336     }
2337   }
2338   // Compare 4-byte vectors
2339   andl(len, 0xfffffffc); // vector count (in bytes)
2340   jccb(Assembler::zero, COMPARE_CHAR);
2341 
2342   lea(ary1, Address(ary1, len, Address::times_1));
2343   negptr(len);
2344 
2345   bind(COMPARE_VECTORS);
2346   movl(tmp1, Address(ary1, len, Address::times_1));
2347   andl(tmp1, 0x80808080);
2348   jccb(Assembler::notZero, TRUE_LABEL);
2349   addptr(len, 4);
2350   jcc(Assembler::notZero, COMPARE_VECTORS);
2351 
2352   // Compare trailing char (final 2 bytes), if any
2353   bind(COMPARE_CHAR);
2354   testl(result, 0x2);   // tail  char
2355   jccb(Assembler::zero, COMPARE_BYTE);
2356   load_unsigned_short(tmp1, Address(ary1, 0));
2357   andl(tmp1, 0x00008080);
2358   jccb(Assembler::notZero, TRUE_LABEL);
2359   subptr(result, 2);
2360   lea(ary1, Address(ary1, 2));
2361 
2362   bind(COMPARE_BYTE);
2363   testl(result, 0x1);   // tail  byte
2364   jccb(Assembler::zero, FALSE_LABEL);
2365   load_unsigned_byte(tmp1, Address(ary1, 0));
2366   andl(tmp1, 0x00000080);
2367   jccb(Assembler::notEqual, TRUE_LABEL);
2368   jmpb(FALSE_LABEL);
2369 
2370   bind(TRUE_LABEL);
2371   movl(result, 1);   // return true
2372   jmpb(DONE);
2373 
2374   bind(FALSE_LABEL);
2375   xorl(result, result); // return false
2376 
2377   // That's it
2378   bind(DONE);
2379   if (UseAVX >= 2 && UseSSE >= 2) {
2380     // clean upper bits of YMM registers
2381     vpxor(vec1, vec1);
2382     vpxor(vec2, vec2);
2383   }
2384 }
2385 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
2386 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
2387                                       Register limit, Register result, Register chr,
2388                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
2389   ShortBranchVerifier sbv(this);
2390   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
2391 
2392   int length_offset  = arrayOopDesc::length_offset_in_bytes();
2393   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
2394 
2395   if (is_array_equ) {
2396     // Check the input args
2397     cmpoop(ary1, ary2);
2398     jcc(Assembler::equal, TRUE_LABEL);
2399 
2400     // Need additional checks for arrays_equals.
2401     testptr(ary1, ary1);
2402     jcc(Assembler::zero, FALSE_LABEL);
2403     testptr(ary2, ary2);
2404     jcc(Assembler::zero, FALSE_LABEL);
2405 
2406     // Check the lengths
2407     movl(limit, Address(ary1, length_offset));
2408     cmpl(limit, Address(ary2, length_offset));
2409     jcc(Assembler::notEqual, FALSE_LABEL);
2410   }
2411 
2412   // count == 0
2413   testl(limit, limit);
2414   jcc(Assembler::zero, TRUE_LABEL);
2415 
2416   if (is_array_equ) {
2417     // Load array address
2418     lea(ary1, Address(ary1, base_offset));
2419     lea(ary2, Address(ary2, base_offset));
2420   }
2421 
2422   if (is_array_equ && is_char) {
2423     // arrays_equals when used for char[].
2424     shll(limit, 1);      // byte count != 0
2425   }
2426   movl(result, limit); // copy
2427 
2428   if (UseAVX >= 2) {
2429     // With AVX2, use 32-byte vector compare
2430     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2431 
2432     // Compare 32-byte vectors
2433     andl(result, 0x0000001f);  //   tail count (in bytes)
2434     andl(limit, 0xffffffe0);   // vector count (in bytes)
2435     jcc(Assembler::zero, COMPARE_TAIL);
2436 
2437     lea(ary1, Address(ary1, limit, Address::times_1));
2438     lea(ary2, Address(ary2, limit, Address::times_1));
2439     negptr(limit);
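         // ary1/ary2 now point just past the 32-byte vectorized region; limit is a
         // negative byte offset that is walked up toward zero.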
2440 
2441 #ifdef _LP64
2442     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
2443       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
2444 
2445       cmpl(limit, -64);
2446       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2447 
2448       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2449 
2450       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
2451       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
2452       kortestql(k7, k7);
2453       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2454       addptr(limit, 64);  // update since we already compared at this addr
2455       cmpl(limit, -64);
2456       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2457 
2458       // At this point we may still need to compare -limit+result bytes.
2459       // We could execute the next two instructions and just continue via the non-wide path:
2460       //  cmpl(limit, 0);
2461       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
2462       // But since we stopped at the points ary{1,2}+limit which are
2463       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
2464       // (|limit| <= 32 and result < 32),
2465       // we may just compare the last 64 bytes.
2466       //
2467       addptr(result, -64);   // it is safe because we just came from this area
2468       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
2469       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
2470       kortestql(k7, k7);
2471       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2472 
2473       jmp(TRUE_LABEL);
2474 
2475       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2476 
2477     }//if (VM_Version::supports_avx512vlbw())
2478 #endif //_LP64
2479     bind(COMPARE_WIDE_VECTORS);
2480     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
2481     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
2482     vpxor(vec1, vec2);
2483 
2484     vptest(vec1, vec1);
2485     jcc(Assembler::notZero, FALSE_LABEL);
2486     addptr(limit, 32);
2487     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2488 
2489     testl(result, result);
2490     jcc(Assembler::zero, TRUE_LABEL);
2491 
2492     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2493     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
2494     vpxor(vec1, vec2);
2495 
2496     vptest(vec1, vec1);
2497     jccb(Assembler::notZero, FALSE_LABEL);
2498     jmpb(TRUE_LABEL);
2499 
2500     bind(COMPARE_TAIL); // limit is zero
2501     movl(limit, result);
2502     // Fallthru to tail compare
2503   } else if (UseSSE42Intrinsics) {
2504     // With SSE4.2, use double quad vector compare
2505     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2506 
2507     // Compare 16-byte vectors
2508     andl(result, 0x0000000f);  //   tail count (in bytes)
2509     andl(limit, 0xfffffff0);   // vector count (in bytes)
2510     jcc(Assembler::zero, COMPARE_TAIL);
2511 
2512     lea(ary1, Address(ary1, limit, Address::times_1));
2513     lea(ary2, Address(ary2, limit, Address::times_1));
2514     negptr(limit);
2515 
2516     bind(COMPARE_WIDE_VECTORS);
2517     movdqu(vec1, Address(ary1, limit, Address::times_1));
2518     movdqu(vec2, Address(ary2, limit, Address::times_1));
2519     pxor(vec1, vec2);
2520 
2521     ptest(vec1, vec1);
2522     jcc(Assembler::notZero, FALSE_LABEL);
2523     addptr(limit, 16);
2524     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2525 
2526     testl(result, result);
2527     jcc(Assembler::zero, TRUE_LABEL);
2528 
2529     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2530     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
2531     pxor(vec1, vec2);
2532 
2533     ptest(vec1, vec1);
2534     jccb(Assembler::notZero, FALSE_LABEL);
2535     jmpb(TRUE_LABEL);
2536 
2537     bind(COMPARE_TAIL); // limit is zero
2538     movl(limit, result);
2539     // Fallthru to tail compare
2540   }
2541 
2542   // Compare 4-byte vectors
2543   andl(limit, 0xfffffffc); // vector count (in bytes)
2544   jccb(Assembler::zero, COMPARE_CHAR);
2545 
2546   lea(ary1, Address(ary1, limit, Address::times_1));
2547   lea(ary2, Address(ary2, limit, Address::times_1));
2548   negptr(limit);
2549 
2550   bind(COMPARE_VECTORS);
2551   movl(chr, Address(ary1, limit, Address::times_1));
2552   cmpl(chr, Address(ary2, limit, Address::times_1));
2553   jccb(Assembler::notEqual, FALSE_LABEL);
2554   addptr(limit, 4);
2555   jcc(Assembler::notZero, COMPARE_VECTORS);
2556 
2557   // Compare trailing char (final 2 bytes), if any
2558   bind(COMPARE_CHAR);
2559   testl(result, 0x2);   // tail  char
2560   jccb(Assembler::zero, COMPARE_BYTE);
2561   load_unsigned_short(chr, Address(ary1, 0));
2562   load_unsigned_short(limit, Address(ary2, 0));
2563   cmpl(chr, limit);
2564   jccb(Assembler::notEqual, FALSE_LABEL);
2565 
2566   if (is_array_equ && is_char) {
2567     bind(COMPARE_BYTE);
2568   } else {
2569     lea(ary1, Address(ary1, 2));
2570     lea(ary2, Address(ary2, 2));
2571 
2572     bind(COMPARE_BYTE);
2573     testl(result, 0x1);   // tail  byte
2574     jccb(Assembler::zero, TRUE_LABEL);
2575     load_unsigned_byte(chr, Address(ary1, 0));
2576     load_unsigned_byte(limit, Address(ary2, 0));
2577     cmpl(chr, limit);
2578     jccb(Assembler::notEqual, FALSE_LABEL);
2579   }
2580   bind(TRUE_LABEL);
2581   movl(result, 1);   // return true
2582   jmpb(DONE);
2583 
2584   bind(FALSE_LABEL);
2585   xorl(result, result); // return false
2586 
2587   // That's it
2588   bind(DONE);
2589   if (UseAVX >= 2) {
2590     // clean upper bits of YMM registers
2591     vpxor(vec1, vec1);
2592     vpxor(vec2, vec2);
2593   }
2594 }