1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "oops/methodData.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/opcodes.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/objectMonitor.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
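      // setvectmask loads the AVX-512 opmask register k1 with the low 'src' bits
      // set, i.e. k1 = (1 << src) - 1, and leaves 'src' copied back into 'dst'; it
      // is used by post-loop multiversioning to mask off vector lanes beyond the
      // residual trip count.  restorevectmask below re-enables all lanes by writing
      // the complement of k0 into k1 (assuming k0 still holds its initial zero).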
  36 void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  37   guarantee(PostLoopMultiversioning, "must be");
  38   Assembler::movl(dst, 1);
  39   Assembler::shlxl(dst, dst, src);
  40   Assembler::decl(dst);
  41   Assembler::kmovdl(k1, dst);
  42   Assembler::movl(dst, src);
  43 }
  44 
  45 void C2_MacroAssembler::restorevectmask() {
  46   guarantee(PostLoopMultiversioning, "must be");
  47   Assembler::knotwl(k1, k0);
  48 }
  49 
  50 #if INCLUDE_RTM_OPT
  51 
  52 // Update rtm_counters based on abort status
  53 // input: abort_status
  54 //        rtm_counters (RTMLockingCounters*)
  55 // flags are killed
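      // Every abort bumps abort_count; with PrintPreciseRTMLockingStatistics each
      // bit i set in abort_status additionally bumps the matching abortX counter
      // (the meaning of the individual abort-status bits is listed in rtmLocking.hpp).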
  56 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
  57 
  58   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  59   if (PrintPreciseRTMLockingStatistics) {
  60     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
  61       Label check_abort;
  62       testl(abort_status, (1<<i));
  63       jccb(Assembler::equal, check_abort);
  64       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
  65       bind(check_abort);
  66     }
  67   }
  68 }
  69 
  70 // Branch if (random & (count-1) != 0), count is 2^n
  71 // tmp, scr and flags are killed
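      // The low bits of the time-stamp counter serve as a cheap pseudo-random source:
      // since count is a power of two, (rdtsc & (count-1)) is non-zero roughly
      // (count-1) out of count times, so the fall-through path (which callers use to
      // increment a sampled counter) is taken about once per 'count' executions.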
  72 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  73   assert(tmp == rax, "");
  74   assert(scr == rdx, "");
  75   rdtsc(); // modifies EDX:EAX
  76   andptr(tmp, count-1);
  77   jccb(Assembler::notZero, brLabel);
  78 }
  79 
  80 // Perform abort ratio calculation, set no_rtm bit if high ratio
  81 // input:  rtm_counters_Reg (RTMLockingCounters* address)
  82 // tmpReg, rtm_counters_Reg and flags are killed
  83 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
  84                                                     Register rtm_counters_Reg,
  85                                                     RTMLockingCounters* rtm_counters,
  86                                                     Metadata* method_data) {
  87   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
  88 
  89   if (RTMLockingCalculationDelay > 0) {
  90     // Delay calculation
  91     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
  92     testptr(tmpReg, tmpReg);
  93     jccb(Assembler::equal, L_done);
  94   }
  95   // Abort ratio calculation only if abort_count > RTMAbortThreshold
  96   //   Aborted transactions = abort_count * 100
  97   //   All transactions = total_count *  RTMTotalCountIncrRate
  98   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
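        //   For example, with RTMAbortRatio == 50 the no_rtm bit is set once aborted
        //   transactions account for at least half of all transactions, i.e.
        //   abort_count * 100 >= total_count * RTMTotalCountIncrRate * 50.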
  99 
 100   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 101   cmpptr(tmpReg, RTMAbortThreshold);
 102   jccb(Assembler::below, L_check_always_rtm2);
 103   imulptr(tmpReg, tmpReg, 100);
 104 
 105   Register scrReg = rtm_counters_Reg;
 106   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 107   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 108   imulptr(scrReg, scrReg, RTMAbortRatio);
 109   cmpptr(tmpReg, scrReg);
 110   jccb(Assembler::below, L_check_always_rtm1);
 111   if (method_data != NULL) {
 112     // set rtm_state to "no rtm" in MDO
 113     mov_metadata(tmpReg, method_data);
 114     lock();
 115     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 116   }
 117   jmpb(L_done);
 118   bind(L_check_always_rtm1);
 119   // Reload RTMLockingCounters* address
 120   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 121   bind(L_check_always_rtm2);
 122   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 123   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 124   jccb(Assembler::below, L_done);
 125   if (method_data != NULL) {
 126     // set rtm_state to "always rtm" in MDO
 127     mov_metadata(tmpReg, method_data);
 128     lock();
 129     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 130   }
 131   bind(L_done);
 132 }
 133 
 134 // Update counters and perform abort ratio calculation
 135 // input:  abort_status_Reg
 136 // rtm_counters_Reg, flags are killed
 137 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 138                                       Register rtm_counters_Reg,
 139                                       RTMLockingCounters* rtm_counters,
 140                                       Metadata* method_data,
 141                                       bool profile_rtm) {
 142 
 143   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 144   // update rtm counters based on rax value at abort
 145   // reads abort_status_Reg, updates flags
 146   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 147   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 148   if (profile_rtm) {
 149     // Save abort status because abort_status_Reg is used by following code.
 150     if (RTMRetryCount > 0) {
 151       push(abort_status_Reg);
 152     }
 153     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 154     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 155     // restore abort status
 156     if (RTMRetryCount > 0) {
 157       pop(abort_status_Reg);
 158     }
 159   }
 160 }
 161 
 162 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 163 // inputs: retry_count_Reg
 164 //       : abort_status_Reg
 165 // output: retry_count_Reg decremented by 1
 166 // flags are killed
 167 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 168   Label doneRetry;
 169   assert(abort_status_Reg == rax, "");
 170   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 171   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 172   // if reason is in 0x6 and retry count != 0 then retry
 173   andptr(abort_status_Reg, 0x6);
 174   jccb(Assembler::zero, doneRetry);
 175   testl(retry_count_Reg, retry_count_Reg);
 176   jccb(Assembler::zero, doneRetry);
 177   pause();
 178   decrementl(retry_count_Reg);
 179   jmp(retryLabel);
 180   bind(doneRetry);
 181 }
 182 
 183 // Spin and retry if lock is busy,
 184 // inputs: box_Reg (monitor address)
 185 //       : retry_count_Reg
 186 // output: retry_count_Reg decremented by 1
 187 //       : clear z flag if retry count exceeded
 188 // tmp_Reg, scr_Reg, flags are killed
 189 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 190                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 191   Label SpinLoop, SpinExit, doneRetry;
 192   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 193 
 194   testl(retry_count_Reg, retry_count_Reg);
 195   jccb(Assembler::zero, doneRetry);
 196   decrementl(retry_count_Reg);
 197   movptr(scr_Reg, RTMSpinLoopCount);
 198 
 199   bind(SpinLoop);
 200   pause();
 201   decrementl(scr_Reg);
 202   jccb(Assembler::lessEqual, SpinExit);
 203   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 204   testptr(tmp_Reg, tmp_Reg);
 205   jccb(Assembler::notZero, SpinLoop);
 206 
 207   bind(SpinExit);
 208   jmp(retryLabel);
 209   bind(doneRetry);
 210   incrementl(retry_count_Reg); // clear z flag
 211 }
 212 
 213 // Use RTM for normal stack locks
 214 // Input: objReg (object to lock)
 215 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 216                                          Register retry_on_abort_count_Reg,
 217                                          RTMLockingCounters* stack_rtm_counters,
 218                                          Metadata* method_data, bool profile_rtm,
 219                                          Label& DONE_LABEL, Label& IsInflated) {
 220   assert(UseRTMForStackLocks, "why call this otherwise?");
 221   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 222   assert(tmpReg == rax, "");
 223   assert(scrReg == rdx, "");
 224   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 225 
 226   if (RTMRetryCount > 0) {
 227     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 228     bind(L_rtm_retry);
 229   }
 230   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 231   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
 232   jcc(Assembler::notZero, IsInflated);
 233 
 234   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 235     Label L_noincrement;
 236     if (RTMTotalCountIncrRate > 1) {
 237       // tmpReg, scrReg and flags are killed
 238       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 239     }
 240     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 241     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 242     bind(L_noincrement);
 243   }
 244   xbegin(L_on_abort);
 245   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 246   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
 247   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
 248   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
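        // Inside the transaction we only *observe* the unlocked mark word; the
        // object is never actually stack-locked.  Any competing thread that installs
        // a lock writes the mark word, which conflicts with our read set and aborts
        // the transaction, so mutual exclusion is preserved.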
 249 
 250   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 251   if (UseRTMXendForLockBusy) {
 252     xend();
 253     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 254     jmp(L_decrement_retry);
 255   }
 256   else {
 257     xabort(0);
 258   }
 259   bind(L_on_abort);
 260   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 261     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 262   }
 263   bind(L_decrement_retry);
 264   if (RTMRetryCount > 0) {
 265     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 266     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 267   }
 268 }
 269 
  270 // Use RTM for inflated locks
 271 // inputs: objReg (object to lock)
 272 //         boxReg (on-stack box address (displaced header location) - KILLED)
 273 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 274 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 275                                             Register scrReg, Register retry_on_busy_count_Reg,
 276                                             Register retry_on_abort_count_Reg,
 277                                             RTMLockingCounters* rtm_counters,
 278                                             Metadata* method_data, bool profile_rtm,
 279                                             Label& DONE_LABEL) {
 280   assert(UseRTMLocking, "why call this otherwise?");
 281   assert(tmpReg == rax, "");
 282   assert(scrReg == rdx, "");
 283   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 284   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 285 
 286   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 287   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 288   movptr(boxReg, tmpReg); // Save ObjectMonitor address
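        // From here on boxReg holds the monitor_value-tagged ObjectMonitor*; owner
        // accesses below use OM_OFFSET_NO_MONITOR_VALUE_TAG to fold the tag into
        // the displacement.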
 289 
 290   if (RTMRetryCount > 0) {
 291     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 292     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 293     bind(L_rtm_retry);
 294   }
 295   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 296     Label L_noincrement;
 297     if (RTMTotalCountIncrRate > 1) {
 298       // tmpReg, scrReg and flags are killed
 299       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 300     }
 301     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 302     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 303     bind(L_noincrement);
 304   }
 305   xbegin(L_on_abort);
 306   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 307   movptr(tmpReg, Address(tmpReg, owner_offset));
 308   testptr(tmpReg, tmpReg);
 309   jcc(Assembler::zero, DONE_LABEL);
 310   if (UseRTMXendForLockBusy) {
 311     xend();
 312     jmp(L_decrement_retry);
 313   }
 314   else {
 315     xabort(0);
 316   }
 317   bind(L_on_abort);
 318   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 319   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 320     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 321   }
 322   if (RTMRetryCount > 0) {
 323     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 324     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 325   }
 326 
 327   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 328   testptr(tmpReg, tmpReg) ;
 329   jccb(Assembler::notZero, L_decrement_retry) ;
 330 
 331   // Appears unlocked - try to swing _owner from null to non-null.
 332   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 333 #ifdef _LP64
 334   Register threadReg = r15_thread;
 335 #else
 336   get_thread(scrReg);
 337   Register threadReg = scrReg;
 338 #endif
 339   lock();
 340   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 341 
 342   if (RTMRetryCount > 0) {
 343     // success done else retry
 344     jccb(Assembler::equal, DONE_LABEL) ;
 345     bind(L_decrement_retry);
 346     // Spin and retry if lock is busy.
 347     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 348   }
 349   else {
 350     bind(L_decrement_retry);
 351   }
 352 }
 353 
 354 #endif //  INCLUDE_RTM_OPT
 355 
 356 // fast_lock and fast_unlock used by C2
 357 
 358 // Because the transitions from emitted code to the runtime
 359 // monitorenter/exit helper stubs are so slow it's critical that
 360 // we inline both the stack-locking fast path and the inflated fast path.
 361 //
 362 // See also: cmpFastLock and cmpFastUnlock.
 363 //
 364 // What follows is a specialized inline transliteration of the code
 365 // in enter() and exit(). If we're concerned about I$ bloat another
 366 // option would be to emit TrySlowEnter and TrySlowExit methods
 367 // at startup-time.  These methods would accept arguments as
  368 //    (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 369 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 370 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 371 // In practice, however, the # of lock sites is bounded and is usually small.
 372 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  373 // if the processor uses simple bimodal branch predictors keyed by EIP,
  374 // since the helper routines would be called from multiple synchronization
 375 // sites.
 376 //
  377 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 378 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 379 // to those specialized methods.  That'd give us a mostly platform-independent
 380 // implementation that the JITs could optimize and inline at their pleasure.
  381 // Done correctly, the only time we'd need to cross into native code would be
 382 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 383 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  384 // (b) issue explicit barriers or fence operations.
 385 //
 386 // TODO:
 387 //
 388 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 389 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 390 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 391 //    the lock operators would typically be faster than reifying Self.
 392 //
 393 // *  Ideally I'd define the primitives as:
 394 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 395 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 396 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  397 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 398 //    Furthermore the register assignments are overconstrained, possibly resulting in
 399 //    sub-optimal code near the synchronization site.
 400 //
 401 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 402 //    Alternately, use a better sp-proximity test.
 403 //
 404 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 405 //    Either one is sufficient to uniquely identify a thread.
 406 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 407 //
 408 // *  Intrinsify notify() and notifyAll() for the common cases where the
 409 //    object is locked by the calling thread but the waitlist is empty.
  410 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 411 //
 412 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 413 //    But beware of excessive branch density on AMD Opterons.
 414 //
 415 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 416 //    or failure of the fast path.  If the fast path fails then we pass
 417 //    control to the slow path, typically in C.  In fast_lock and
 418 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 419 //    will emit a conditional branch immediately after the node.
 420 //    So we have branches to branches and lots of ICC.ZF games.
 421 //    Instead, it might be better to have C2 pass a "FailureLabel"
 422 //    into fast_lock and fast_unlock.  In the case of success, control
 423 //    will drop through the node.  ICC.ZF is undefined at exit.
 424 //    In the case of failure, the node will branch directly to the
 425 //    FailureLabel
 426 
 427 
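      // In outline, fast_lock attempts the following (a sketch only -- the code
      // below additionally handles biased locking and the RTM paths):
      //
      //   mark = obj->mark()
      //   if (mark is not inflated):                           // stack-lock attempt
      //     box->dhw = mark | unlocked_value
      //     if (CAS(&obj->mark, box->dhw, box) succeeds)       ZF=1, done
      //     else if (mark - rsp < page_size)                   // recursive stack-lock
      //       box->dhw = 0;                                    ZF=1, done
      //   else:                                                // inflated
      //     if (CAS(&monitor->owner, NULL, Self/box) succeeds) ZF=1, done
      //   otherwise fall through with ZF=0 and take the slow path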
 428 // obj: object to lock
 429 // box: on-stack box address (displaced header location) - KILLED
 430 // rax,: tmp -- KILLED
 431 // scr: tmp -- KILLED
 432 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 433                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 434                                  BiasedLockingCounters* counters,
 435                                  RTMLockingCounters* rtm_counters,
 436                                  RTMLockingCounters* stack_rtm_counters,
 437                                  Metadata* method_data,
 438                                  bool use_rtm, bool profile_rtm) {
 439   // Ensure the register assignments are disjoint
 440   assert(tmpReg == rax, "");
 441 
 442   if (use_rtm) {
 443     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 444   } else {
 445     assert(cx2Reg == noreg, "");
 446     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 447   }
 448 
 449   if (counters != NULL) {
 450     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 451   }
 452 
 453   // Possible cases that we'll encounter in fast_lock
 454   // ------------------------------------------------
 455   // * Inflated
 456   //    -- unlocked
 457   //    -- Locked
 458   //       = by self
 459   //       = by other
 460   // * biased
 461   //    -- by Self
 462   //    -- by other
 463   // * neutral
 464   // * stack-locked
 465   //    -- by self
 466   //       = sp-proximity test hits
 467   //       = sp-proximity test generates false-negative
 468   //    -- by other
 469   //
 470 
 471   Label IsInflated, DONE_LABEL;
 472 
 473   // it's stack-locked, biased or neutral
 474   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 475   // order to reduce the number of conditional branches in the most common cases.
 476   // Beware -- there's a subtle invariant that fetch of the markword
 477   // at [FETCH], below, will never observe a biased encoding (*101b).
 478   // If this invariant is not held we risk exclusion (safety) failure.
 479   if (UseBiasedLocking && !UseOptoBiasInlining) {
 480     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
 481   }
 482 
 483 #if INCLUDE_RTM_OPT
 484   if (UseRTMForStackLocks && use_rtm) {
 485     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 486                       stack_rtm_counters, method_data, profile_rtm,
 487                       DONE_LABEL, IsInflated);
 488   }
 489 #endif // INCLUDE_RTM_OPT
 490 
 491   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 492   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
 493   jccb(Assembler::notZero, IsInflated);
 494 
 495   // Attempt stack-locking ...
 496   orptr (tmpReg, markWord::unlocked_value);
 497   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 498   lock();
 499   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 500   if (counters != NULL) {
 501     cond_inc32(Assembler::equal,
 502                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 503   }
 504   jcc(Assembler::equal, DONE_LABEL);           // Success
 505 
 506   // Recursive locking.
 507   // The object is stack-locked: markword contains stack pointer to BasicLock.
 508   // Locked by current thread if difference with current SP is less than one page.
 509   subptr(tmpReg, rsp);
  510   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
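        // On LP64 the mask is (7 - os::vm_page_size()), i.e. ~(page_size - 8): the AND
        // result is zero only when 0 <= (mark - rsp) < page_size and the low
        // (lock/alignment) bits of the difference are clear.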
 511   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 512   movptr(Address(boxReg, 0), tmpReg);
 513   if (counters != NULL) {
 514     cond_inc32(Assembler::equal,
 515                ExternalAddress((address)counters->fast_path_entry_count_addr()));
 516   }
 517   jmp(DONE_LABEL);
 518 
 519   bind(IsInflated);
 520   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 521 
 522 #if INCLUDE_RTM_OPT
 523   // Use the same RTM locking code in 32- and 64-bit VM.
 524   if (use_rtm) {
 525     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 526                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 527   } else {
 528 #endif // INCLUDE_RTM_OPT
 529 
 530 #ifndef _LP64
 531   // The object is inflated.
 532 
 533   // boxReg refers to the on-stack BasicLock in the current frame.
 534   // We'd like to write:
 535   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  536   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 537   // additional latency as we have another ST in the store buffer that must drain.
 538 
 539   // avoid ST-before-CAS
 540   // register juggle because we need tmpReg for cmpxchgptr below
 541   movptr(scrReg, boxReg);
 542   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 543 
 544   // Optimistic form: consider XORL tmpReg,tmpReg
 545   movptr(tmpReg, NULL_WORD);
 546 
 547   // Appears unlocked - try to swing _owner from null to non-null.
 548   // Ideally, I'd manifest "Self" with get_thread and then attempt
 549   // to CAS the register containing Self into m->Owner.
 550   // But we don't have enough registers, so instead we can either try to CAS
 551   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 552   // we later store "Self" into m->Owner.  Transiently storing a stack address
 553   // (rsp or the address of the box) into  m->owner is harmless.
 554   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 555   lock();
 556   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 557   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 558   // If we weren't able to swing _owner from NULL to the BasicLock
 559   // then take the slow path.
 560   jccb  (Assembler::notZero, DONE_LABEL);
 561   // update _owner from BasicLock to thread
 562   get_thread (scrReg);                    // beware: clobbers ICCs
 563   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 564   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 565 
 566   // If the CAS fails we can either retry or pass control to the slow path.
 567   // We use the latter tactic.
 568   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 569   // If the CAS was successful ...
 570   //   Self has acquired the lock
 571   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 572   // Intentional fall-through into DONE_LABEL ...
 573 #else // _LP64
 574   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 575   movq(scrReg, tmpReg);
 576   xorq(tmpReg, tmpReg);
 577   lock();
 578   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 579   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 580   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 581   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
 582   // Intentional fall-through into DONE_LABEL ...
 583   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 584 #endif // _LP64
 585 #if INCLUDE_RTM_OPT
 586   } // use_rtm()
 587 #endif
 588   // DONE_LABEL is a hot target - we'd really like to place it at the
 589   // start of cache line by padding with NOPs.
 590   // See the AMD and Intel software optimization manuals for the
 591   // most efficient "long" NOP encodings.
 592   // Unfortunately none of our alignment mechanisms suffice.
 593   bind(DONE_LABEL);
 594 
 595   // At DONE_LABEL the icc ZFlag is set as follows ...
 596   // fast_unlock uses the same protocol.
 597   // ZFlag == 1 -> Success
 598   // ZFlag == 0 -> Failure - force control through the slow path
 599 }
 600 
 601 // obj: object to unlock
 602 // box: box address (displaced header location), killed.  Must be EAX.
 603 // tmp: killed, cannot be obj nor box.
 604 //
 605 // Some commentary on balanced locking:
 606 //
 607 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 608 // Methods that don't have provably balanced locking are forced to run in the
 609 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 610 // The interpreter provides two properties:
 611 // I1:  At return-time the interpreter automatically and quietly unlocks any
  612 //      objects acquired by the current activation (frame).  Recall that the
 613 //      interpreter maintains an on-stack list of locks currently held by
 614 //      a frame.
 615 // I2:  If a method attempts to unlock an object that is not held by the
  616 //      frame, the interpreter throws IMSX.
 617 //
  618 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 619 // B() doesn't have provably balanced locking so it runs in the interpreter.
 620 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 621 // is still locked by A().
 622 //
 623 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 624 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 625 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 626 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
  627 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 628 // could reasonably *avoid* checking owner in fast_unlock().
  629 // In the interest of performance we elide the m->Owner==Self check in unlock.
 630 // A perfectly viable alternative is to elide the owner check except when
 631 // Xcheck:jni is enabled.
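      //
      // In outline, fast_unlock does (a sketch only -- the biased locking and RTM
      // paths are handled separately):
      //
      //   if (box->dhw == 0)                          ZF=1, done        // recursive stack-lock
      //   mark = obj->mark()
      //   if (mark is not inflated):                  // stack-locked
      //     CAS(&obj->mark, box, box->dhw)            // try to pop the lock; ZF from the CAS
      //   else:                                       // inflated
      //     if (monitor->recursions != 0)             ZF=0, slow path
      //     else if ((cxq|EntryList) == 0)            monitor->owner = NULL; ZF=1, done
      //     else                                      check _succ / possibly re-acquire (CheckSucc below)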
 632 
 633 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 634   assert(boxReg == rax, "");
 635   assert_different_registers(objReg, boxReg, tmpReg);
 636 
 637   Label DONE_LABEL, Stacked, CheckSucc;
 638 
 639   // Critically, the biased locking test must have precedence over
 640   // and appear before the (box->dhw == 0) recursive stack-lock test.
 641   if (UseBiasedLocking && !UseOptoBiasInlining) {
 642     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 643   }
 644 
 645 #if INCLUDE_RTM_OPT
 646   if (UseRTMForStackLocks && use_rtm) {
 647     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 648     Label L_regular_unlock;
 649     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 650     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
 651     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
 652     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 653     xend();                                                           // otherwise end...
 654     jmp(DONE_LABEL);                                                  // ... and we're done
 655     bind(L_regular_unlock);
 656   }
 657 #endif
 658 
 659   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
 660   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
 661   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 662   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 663   jccb  (Assembler::zero, Stacked);
 664 
 665   // It's inflated.
 666 #if INCLUDE_RTM_OPT
 667   if (use_rtm) {
 668     Label L_regular_inflated_unlock;
 669     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 670     movptr(boxReg, Address(tmpReg, owner_offset));
 671     testptr(boxReg, boxReg);
 672     jccb(Assembler::notZero, L_regular_inflated_unlock);
 673     xend();
 674     jmpb(DONE_LABEL);
 675     bind(L_regular_inflated_unlock);
 676   }
 677 #endif
 678 
 679   // Despite our balanced locking property we still check that m->_owner == Self
 680   // as java routines or native JNI code called by this thread might
 681   // have released the lock.
 682   // Refer to the comments in synchronizer.cpp for how we might encode extra
 683   // state in _succ so we can avoid fetching EntryList|cxq.
 684   //
 685   // I'd like to add more cases in fast_lock() and fast_unlock() --
 686   // such as recursive enter and exit -- but we have to be wary of
 687   // I$ bloat, T$ effects and BP$ effects.
 688   //
 689   // If there's no contention try a 1-0 exit.  That is, exit without
 690   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 691   // we detect and recover from the race that the 1-0 exit admits.
 692   //
 693   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 694   // before it STs null into _owner, releasing the lock.  Updates
 695   // to data protected by the critical section must be visible before
 696   // we drop the lock (and thus before any other thread could acquire
 697   // the lock and observe the fields protected by the lock).
 698   // IA32's memory-model is SPO, so STs are ordered with respect to
 699   // each other and there's no need for an explicit barrier (fence).
 700   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 701 #ifndef _LP64
 702   get_thread (boxReg);
 703 
 704   // Note that we could employ various encoding schemes to reduce
 705   // the number of loads below (currently 4) to just 2 or 3.
 706   // Refer to the comments in synchronizer.cpp.
 707   // In practice the chain of fetches doesn't seem to impact performance, however.
 708   xorptr(boxReg, boxReg);
 709   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 710   jccb  (Assembler::notZero, DONE_LABEL);
 711   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 712   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 713   jccb  (Assembler::notZero, CheckSucc);
 714   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 715   jmpb  (DONE_LABEL);
 716 
 717   bind (Stacked);
 718   // It's not inflated and it's not recursively stack-locked and it's not biased.
 719   // It must be stack-locked.
 720   // Try to reset the header to displaced header.
 721   // The "box" value on the stack is stable, so we can reload
 722   // and be assured we observe the same value as above.
 723   movptr(tmpReg, Address(boxReg, 0));
 724   lock();
 725   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  726   // Intentional fall-thru into DONE_LABEL
 727 
 728   // DONE_LABEL is a hot target - we'd really like to place it at the
 729   // start of cache line by padding with NOPs.
 730   // See the AMD and Intel software optimization manuals for the
 731   // most efficient "long" NOP encodings.
 732   // Unfortunately none of our alignment mechanisms suffice.
 733   bind (CheckSucc);
 734 #else // _LP64
 735   // It's inflated
 736   xorptr(boxReg, boxReg);
 737   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 738   jccb  (Assembler::notZero, DONE_LABEL);
 739   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 740   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 741   jccb  (Assembler::notZero, CheckSucc);
 742   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 743   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 744   jmpb  (DONE_LABEL);
 745 
 746   // Try to avoid passing control into the slow_path ...
 747   Label LSuccess, LGoSlowPath ;
 748   bind  (CheckSucc);
 749 
 750   // The following optional optimization can be elided if necessary
 751   // Effectively: if (succ == null) goto slow path
 752   // The code reduces the window for a race, however,
 753   // and thus benefits performance.
 754   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 755   jccb  (Assembler::zero, LGoSlowPath);
 756 
 757   xorptr(boxReg, boxReg);
 758   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 759   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 760 
 761   // Memory barrier/fence
 762   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 763   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 764   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 765   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 766   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 767   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 768   lock(); addl(Address(rsp, 0), 0);
 769 
 770   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 771   jccb  (Assembler::notZero, LSuccess);
 772 
 773   // Rare inopportune interleaving - race.
 774   // The successor vanished in the small window above.
 775   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 776   // We need to ensure progress and succession.
 777   // Try to reacquire the lock.
 778   // If that fails then the new owner is responsible for succession and this
 779   // thread needs to take no further action and can exit via the fast path (success).
 780   // If the re-acquire succeeds then pass control into the slow path.
 781   // As implemented, this latter mode is horrible because we generated more
  782 // coherence traffic on the lock *and* artificially extended the critical section
  783 // length by virtue of passing control into the slow path.
 784 
 785   // box is really RAX -- the following CMPXCHG depends on that binding
 786   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 787   lock();
 788   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 789   // There's no successor so we tried to regrab the lock.
 790   // If that didn't work, then another thread grabbed the
 791   // lock so we're done (and exit was a success).
 792   jccb  (Assembler::notEqual, LSuccess);
 793   // Intentional fall-through into slow path
 794 
 795   bind  (LGoSlowPath);
 796   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 797   jmpb  (DONE_LABEL);
 798 
 799   bind  (LSuccess);
 800   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 801   jmpb  (DONE_LABEL);
 802 
 803   bind  (Stacked);
 804   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 805   lock();
 806   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 807 
 808 #endif
 809   bind(DONE_LABEL);
 810 }
 811 
 812 //-------------------------------------------------------------------------------------------
 813 // Generic instructions support for use in .ad files C2 code generation
 814 
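      // Vector abs/neg for floats and doubles: Abs clears the sign bit(s) by ANDing
      // with a sign-mask constant, Neg flips them by XORing with a sign-flip constant
      // (both constants live in StubRoutines::x86).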
 815 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 816   if (dst != src) {
 817     movdqu(dst, src);
 818   }
 819   if (opcode == Op_AbsVD) {
 820     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
 821   } else {
  822     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 823     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
 824   }
 825 }
 826 
 827 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 828   if (opcode == Op_AbsVD) {
 829     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
 830   } else {
  831     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 832     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
 833   }
 834 }
 835 
 836 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
 837   if (dst != src) {
 838     movdqu(dst, src);
 839   }
 840   if (opcode == Op_AbsVF) {
 841     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
 842   } else {
  843     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 844     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
 845   }
 846 }
 847 
 848 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
 849   if (opcode == Op_AbsVF) {
 850     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
 851   } else {
  852     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 853     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
 854   }
 855 }
 856 
 857 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
 858   if (sign) {
 859     pmovsxbw(dst, src);
 860   } else {
 861     pmovzxbw(dst, src);
 862   }
 863 }
 864 
 865 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
 866   if (sign) {
 867     vpmovsxbw(dst, src, vector_len);
 868   } else {
 869     vpmovzxbw(dst, src, vector_len);
 870   }
 871 }
 872 
 873 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
 874   if (opcode == Op_RShiftVI) {
 875     psrad(dst, src);
 876   } else if (opcode == Op_LShiftVI) {
 877     pslld(dst, src);
 878   } else {
 879     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 880     psrld(dst, src);
 881   }
 882 }
 883 
 884 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 885   if (opcode == Op_RShiftVI) {
 886     vpsrad(dst, nds, src, vector_len);
 887   } else if (opcode == Op_LShiftVI) {
 888     vpslld(dst, nds, src, vector_len);
 889   } else {
 890     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
 891     vpsrld(dst, nds, src, vector_len);
 892   }
 893 }
 894 
 895 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
 896   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 897     psraw(dst, src);
 898   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 899     psllw(dst, src);
 900   } else {
 901     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 902     psrlw(dst, src);
 903   }
 904 }
 905 
 906 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 907   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
 908     vpsraw(dst, nds, src, vector_len);
 909   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
 910     vpsllw(dst, nds, src, vector_len);
 911   } else {
 912     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
 913     vpsrlw(dst, nds, src, vector_len);
 914   }
 915 }
 916 
 917 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
 918   if (opcode == Op_RShiftVL) {
  919     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
 920   } else if (opcode == Op_LShiftVL) {
 921     psllq(dst, src);
 922   } else {
 923     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 924     psrlq(dst, src);
 925   }
 926 }
 927 
 928 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 929   if (opcode == Op_RShiftVL) {
 930     evpsraq(dst, nds, src, vector_len);
 931   } else if (opcode == Op_LShiftVL) {
 932     vpsllq(dst, nds, src, vector_len);
 933   } else {
 934     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
 935     vpsrlq(dst, nds, src, vector_len);
 936   }
 937 }
 938 
 939 // Reductions for vectors of ints, longs, floats, and doubles.
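      // General shape: integer and long reductions repeatedly fold the upper half of
      // the vector onto the lower half with the packed operation and finally combine
      // the surviving lane with the incoming scalar in src1.  Float and double
      // reductions instead accumulate the lanes one at a time into dst with the
      // scalar addss/addsd and mulss/mulsd forms, keeping the lane order fixed.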
 940 
 941 void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
 942   int vector_len = Assembler::AVX_128bit;
 943 
 944   switch (opcode) {
 945     case Op_AndReductionV:  pand(dst, src); break;
 946     case Op_OrReductionV:   por (dst, src); break;
 947     case Op_XorReductionV:  pxor(dst, src); break;
 948 
 949     case Op_AddReductionVF: addss(dst, src); break;
 950     case Op_AddReductionVD: addsd(dst, src); break;
 951     case Op_AddReductionVI: paddd(dst, src); break;
 952     case Op_AddReductionVL: paddq(dst, src); break;
 953 
 954     case Op_MulReductionVF: mulss(dst, src); break;
 955     case Op_MulReductionVD: mulsd(dst, src); break;
 956     case Op_MulReductionVI: pmulld(dst, src); break;
 957     case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
 958 
 959     default: assert(false, "wrong opcode");
 960   }
 961 }
 962 
 963 void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
 964   int vector_len = Assembler::AVX_256bit;
 965 
 966   switch (opcode) {
 967     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
 968     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
 969     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
 970 
 971     case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
 972     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
 973 
 974     case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
 975     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
 976 
 977     default: assert(false, "wrong opcode");
 978   }
 979 }
 980 
 981 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
 982                                   XMMRegister dst, XMMRegister src,
 983                                   XMMRegister vtmp1, XMMRegister vtmp2) {
 984   switch (opcode) {
 985     case Op_AddReductionVF:
 986     case Op_MulReductionVF:
 987       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
 988       break;
 989 
 990     case Op_AddReductionVD:
 991     case Op_MulReductionVD:
 992       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
 993       break;
 994 
 995     default: assert(false, "wrong opcode");
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1000                                 Register dst, Register src1, XMMRegister src2,
1001                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1002   switch (vlen) {
1003     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1004     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1005     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1006     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1007 
1008     default: assert(false, "wrong vector length");
1009   }
1010 }
1011 
1012 #ifdef _LP64
1013 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1014                                 Register dst, Register src1, XMMRegister src2,
1015                                 XMMRegister vtmp1, XMMRegister vtmp2) {
1016   switch (vlen) {
1017     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1018     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1019     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1020 
1021     default: assert(false, "wrong vector length");
1022   }
1023 }
1024 #endif // _LP64
1025 
1026 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1027   switch (vlen) {
1028     case 2:
1029       assert(vtmp2 == xnoreg, "");
1030       reduce2F(opcode, dst, src, vtmp1);
1031       break;
1032     case 4:
1033       assert(vtmp2 == xnoreg, "");
1034       reduce4F(opcode, dst, src, vtmp1);
1035       break;
1036     case 8:
1037       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1038       break;
1039     case 16:
1040       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1041       break;
1042     default: assert(false, "wrong vector length");
1043   }
1044 }
1045 
1046 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1047   switch (vlen) {
1048     case 2:
1049       assert(vtmp2 == xnoreg, "");
1050       reduce2D(opcode, dst, src, vtmp1);
1051       break;
1052     case 4:
1053       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1054       break;
1055     case 8:
1056       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1057       break;
1058     default: assert(false, "wrong vector length");
1059   }
1060 }
1061 
1062 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1063   if (opcode == Op_AddReductionVI) {
1064     if (vtmp1 != src2) {
1065       movdqu(vtmp1, src2);
1066     }
1067     phaddd(vtmp1, vtmp1);
1068   } else {
1069     pshufd(vtmp1, src2, 0x1);
1070     reduce_operation_128(opcode, vtmp1, src2);
1071   }
1072   movdl(vtmp2, src1);
1073   reduce_operation_128(opcode, vtmp1, vtmp2);
1074   movdl(dst, vtmp1);
1075 }
1076 
1077 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1078   if (opcode == Op_AddReductionVI) {
1079     if (vtmp1 != src2) {
1080       movdqu(vtmp1, src2);
1081     }
1082     phaddd(vtmp1, src2);
1083     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1084   } else {
1085     pshufd(vtmp2, src2, 0xE);
1086     reduce_operation_128(opcode, vtmp2, src2);
1087     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1088   }
1089 }
1090 
1091 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1092   if (opcode == Op_AddReductionVI) {
1093     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1094     vextracti128_high(vtmp2, vtmp1);
1095     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1096     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1097   } else {
1098     vextracti128_high(vtmp1, src2);
1099     reduce_operation_128(opcode, vtmp1, src2);
1100     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1101   }
1102 }
1103 
1104 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1105   vextracti64x4_high(vtmp2, src2);
1106   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1107   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1108 }
1109 
1110 #ifdef _LP64
1111 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1112   pshufd(vtmp2, src2, 0xE);
1113   reduce_operation_128(opcode, vtmp2, src2);
1114   movdq(vtmp1, src1);
1115   reduce_operation_128(opcode, vtmp1, vtmp2);
1116   movdq(dst, vtmp1);
1117 }
1118 
1119 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1120   vextracti128_high(vtmp1, src2);
1121   reduce_operation_128(opcode, vtmp1, src2);
1122   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1123 }
1124 
1125 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1126   vextracti64x4_high(vtmp2, src2);
1127   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
1128   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1129 }
1130 #endif // _LP64
1131 
1132 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1133   reduce_operation_128(opcode, dst, src);
1134   pshufd(vtmp, src, 0x1);
1135   reduce_operation_128(opcode, dst, vtmp);
1136 }
1137 
1138 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1139   reduce2F(opcode, dst, src, vtmp);
1140   pshufd(vtmp, src, 0x2);
1141   reduce_operation_128(opcode, dst, vtmp);
1142   pshufd(vtmp, src, 0x3);
1143   reduce_operation_128(opcode, dst, vtmp);
1144 }
1145 
1146 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1147   reduce4F(opcode, dst, src, vtmp2);
1148   vextractf128_high(vtmp2, src);
1149   reduce4F(opcode, dst, vtmp2, vtmp1);
1150 }
1151 
1152 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1153   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1154   vextracti64x4_high(vtmp1, src);
1155   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1156 }
1157 
1158 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1159   reduce_operation_128(opcode, dst, src);
1160   pshufd(vtmp, src, 0xE);
1161   reduce_operation_128(opcode, dst, vtmp);
1162 }
1163 
1164 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1165   reduce2D(opcode, dst, src, vtmp2);
1166   vextractf128_high(vtmp2, src);
1167   reduce2D(opcode, dst, vtmp2, vtmp1);
1168 }
1169 
1170 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1171   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1172   vextracti64x4_high(vtmp1, src);
1173   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1174 }
1175 
1176 //-------------------------------------------------------------------------------------------
1177 
1178 // IndexOf for constant substrings with size >= 8 chars
1179 // which don't need to be loaded through stack.
1180 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
1181                                          Register cnt1, Register cnt2,
1182                                          int int_cnt2,  Register result,
1183                                          XMMRegister vec, Register tmp,
1184                                          int ae) {
1185   ShortBranchVerifier sbv(this);
1186   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1187   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1188 
1189   // This method uses the pcmpestri instruction with bound registers
1190   //   inputs:
1191   //     xmm - substring
1192   //     rax - substring length (elements count)
1193   //     mem - scanned string
1194   //     rdx - string length (elements count)
1195   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1196   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1197   //   outputs:
1198   //     rcx - matched index in string
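        //     CF  - set if at least one candidate position matched
        //     OF  - set if the match starts at element 0 (whole vector matched)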
1199   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1200   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1201   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1202   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1203   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
1204 
1205   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
1206         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
1207         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
1208 
1209   // Note, inline_string_indexOf() generates checks:
1210   // if (substr.count > string.count) return -1;
1211   // if (substr.count == 0) return 0;
1212   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
1213 
1214   // Load substring.
1215   if (ae == StrIntrinsicNode::UL) {
1216     pmovzxbw(vec, Address(str2, 0));
1217   } else {
1218     movdqu(vec, Address(str2, 0));
1219   }
1220   movl(cnt2, int_cnt2);
1221   movptr(result, str1); // string addr
1222 
1223   if (int_cnt2 > stride) {
1224     jmpb(SCAN_TO_SUBSTR);
1225 
1226     // Reload substr for rescan, this code
1227     // is executed only for large substrings (> 8 chars)
1228     bind(RELOAD_SUBSTR);
1229     if (ae == StrIntrinsicNode::UL) {
1230       pmovzxbw(vec, Address(str2, 0));
1231     } else {
1232       movdqu(vec, Address(str2, 0));
1233     }
1234     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
1235 
1236     bind(RELOAD_STR);
1237     // We came here after the beginning of the substring was
1238     // matched but the rest of it was not, so we need to search
1239     // again. Start from the next element after the previous match.
1240 
1241     // cnt2 is the number of remaining substring elements and
1242     // cnt1 is the number of remaining string elements when cmp failed.
1243     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
1244     subl(cnt1, cnt2);
1245     addl(cnt1, int_cnt2);
1246     movl(cnt2, int_cnt2); // Now restore cnt2
1247 
1248     decrementl(cnt1);     // Shift to next element
1249     cmpl(cnt1, cnt2);
1250     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1251 
1252     addptr(result, (1<<scale1));
1253 
1254   } // (int_cnt2 > 8)
1255 
1256   // Scan string for start of substr in 16-byte vectors
1257   bind(SCAN_TO_SUBSTR);
1258   pcmpestri(vec, Address(result, 0), mode);
1259   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1260   subl(cnt1, stride);
1261   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1262   cmpl(cnt1, cnt2);
1263   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1264   addptr(result, 16);
1265   jmpb(SCAN_TO_SUBSTR);
1266 
1267   // Found a potential substr
1268   bind(FOUND_CANDIDATE);
1269   // Matched whole vector if first element matched (tmp(rcx) == 0).
1270   if (int_cnt2 == stride) {
1271     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
1272   } else { // int_cnt2 > 8
1273     jccb(Assembler::overflow, FOUND_SUBSTR);
1274   }
1275   // After pcmpestri tmp(rcx) contains matched element index
1276   // Compute start addr of substr
1277   lea(result, Address(result, tmp, scale1));
1278 
1279   // Make sure string is still long enough
1280   subl(cnt1, tmp);
1281   cmpl(cnt1, cnt2);
1282   if (int_cnt2 == stride) {
1283     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1284   } else { // int_cnt2 > 8
1285     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
1286   }
1287   // Left less than substring.
1288 
1289   bind(RET_NOT_FOUND);
1290   movl(result, -1);
1291   jmp(EXIT);
1292 
1293   if (int_cnt2 > stride) {
1294     // This code is optimized for the case when whole substring
1295     // is matched if its head is matched.
1296     bind(MATCH_SUBSTR_HEAD);
1297     pcmpestri(vec, Address(result, 0), mode);
1298     // Reload only the string if it does not match
1299     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
1300 
1301     Label CONT_SCAN_SUBSTR;
1302     // Compare the rest of substring (> 8 chars).
1303     bind(FOUND_SUBSTR);
1304     // First 8 chars are already matched.
1305     negptr(cnt2);
1306     addptr(cnt2, stride);
1307 
1308     bind(SCAN_SUBSTR);
1309     subl(cnt1, stride);
1310     cmpl(cnt2, -stride); // Do not read beyond substring
1311     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
1312     // Back-up strings to avoid reading beyond substring:
1313     // cnt1 = cnt1 - cnt2 + 8
1314     addl(cnt1, cnt2); // cnt2 is negative
1315     addl(cnt1, stride);
1316     movl(cnt2, stride); negptr(cnt2);
1317     bind(CONT_SCAN_SUBSTR);
1318     if (int_cnt2 < (int)G) {
1319       int tail_off1 = int_cnt2<<scale1;
1320       int tail_off2 = int_cnt2<<scale2;
1321       if (ae == StrIntrinsicNode::UL) {
1322         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
1323       } else {
1324         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
1325       }
1326       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
1327     } else {
1328       // calculate index in register to avoid integer overflow (int_cnt2*2)
1329       movl(tmp, int_cnt2);
1330       addptr(tmp, cnt2);
1331       if (ae == StrIntrinsicNode::UL) {
1332         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
1333       } else {
1334         movdqu(vec, Address(str2, tmp, scale2, 0));
1335       }
1336       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
1337     }
1338     // Need to reload string pointers if we did not match the whole vector
1339     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1340     addptr(cnt2, stride);
1341     jcc(Assembler::negative, SCAN_SUBSTR);
1342     // Fall through if found full substring
1343 
1344   } // (int_cnt2 > 8)
1345 
1346   bind(RET_FOUND);
1347   // Found result if we matched full small substring.
1348   // Compute substr offset
1349   subptr(result, str1);
1350   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1351     shrl(result, 1); // index
1352   }
1353   bind(EXIT);
1354 
1355 } // string_indexofC8
1356 
1357 // Small strings are loaded through the stack if they cross a page boundary.
1358 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
1359                                        Register cnt1, Register cnt2,
1360                                        int int_cnt2,  Register result,
1361                                        XMMRegister vec, Register tmp,
1362                                        int ae) {
1363   ShortBranchVerifier sbv(this);
1364   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1365   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1366 
1367   //
1368   // int_cnt2 is length of small (< 8 chars) constant substring
1369   // or (-1) for non constant substring in which case its length
1370   // is in cnt2 register.
1371   //
1372   // Note, inline_string_indexOf() generates checks:
1373   // if (substr.count > string.count) return -1;
1374   // if (substr.count == 0) return 0;
1375   //
1376   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1377   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
1378   // This method uses the pcmpestri instruction with bound registers
1379   //   inputs:
1380   //     xmm - substring
1381   //     rax - substring length (elements count)
1382   //     mem - scanned string
1383   //     rdx - string length (elements count)
1384   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1385   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1386   //   outputs:
1387   //     rcx - matched index in string
1388   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1389   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1390   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1391   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
1392 
1393   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
1394         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
1395         FOUND_CANDIDATE;
1396 
1397   { //========================================================
1398     // We don't know where these strings are located
1399     // and we can't read beyond them. Load them through stack.
1400     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
1401 
1402     movptr(tmp, rsp); // save old SP
1403 
1404     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
1405       if (int_cnt2 == (1>>scale2)) { // One byte
1406         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
1407         load_unsigned_byte(result, Address(str2, 0));
1408         movdl(vec, result); // move 32 bits
1409       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
1410         // Not enough header space in 32-bit VM: 12+3 = 15.
1411         movl(result, Address(str2, -1));
1412         shrl(result, 8);
1413         movdl(vec, result); // move 32 bits
1414       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
1415         load_unsigned_short(result, Address(str2, 0));
1416         movdl(vec, result); // move 32 bits
1417       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
1418         movdl(vec, Address(str2, 0)); // move 32 bits
1419       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
1420         movq(vec, Address(str2, 0));  // move 64 bits
1421       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
1422         // Array header size is 12 bytes in 32-bit VM
1423         // + 6 bytes for 3 chars == 18 bytes,
1424         // enough space to load vec and shift.
1425         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
1426         if (ae == StrIntrinsicNode::UL) {
1427           int tail_off = int_cnt2-8;
1428           pmovzxbw(vec, Address(str2, tail_off));
1429           psrldq(vec, -2*tail_off);
1430         }
1431         else {
1432           int tail_off = int_cnt2*(1<<scale2);
1433           movdqu(vec, Address(str2, tail_off-16));
1434           psrldq(vec, 16-tail_off);
1435         }
1436       }
1437     } else { // not constant substring
1438       cmpl(cnt2, stride);
1439       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
1440 
1441       // We can read beyond the string if str+16 does not cross a page boundary
1442       // since heaps are aligned and mapped by pages.
1443       assert(os::vm_page_size() < (int)G, "default page should be small");
1444       movl(result, str2); // We need only low 32 bits
1445       andl(result, (os::vm_page_size()-1));
1446       cmpl(result, (os::vm_page_size()-16));
1447       jccb(Assembler::belowEqual, CHECK_STR);
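           // A minimal sketch of the check above, assuming a power-of-two page size:
           //   (str2 & (page_size - 1)) <= page_size - 16
           // holds iff the offset of str2 within its page leaves at least 16 bytes
           // before the page boundary, so a 16-byte load from str2 stays on one page.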
1448 
1449       // Move small strings to the stack to allow loading 16 bytes into vec.
1450       subptr(rsp, 16);
1451       int stk_offset = wordSize-(1<<scale2);
1452       push(cnt2);
1453 
1454       bind(COPY_SUBSTR);
1455       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
1456         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
1457         movb(Address(rsp, cnt2, scale2, stk_offset), result);
1458       } else if (ae == StrIntrinsicNode::UU) {
1459         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
1460         movw(Address(rsp, cnt2, scale2, stk_offset), result);
1461       }
1462       decrement(cnt2);
1463       jccb(Assembler::notZero, COPY_SUBSTR);
1464 
1465       pop(cnt2);
1466       movptr(str2, rsp);  // New substring address
1467     } // non constant
1468 
1469     bind(CHECK_STR);
1470     cmpl(cnt1, stride);
1471     jccb(Assembler::aboveEqual, BIG_STRINGS);
1472 
1473     // Check cross page boundary.
1474     movl(result, str1); // We need only low 32 bits
1475     andl(result, (os::vm_page_size()-1));
1476     cmpl(result, (os::vm_page_size()-16));
1477     jccb(Assembler::belowEqual, BIG_STRINGS);
1478 
1479     subptr(rsp, 16);
1480     int stk_offset = -(1<<scale1);
1481     if (int_cnt2 < 0) { // not constant
1482       push(cnt2);
1483       stk_offset += wordSize;
1484     }
1485     movl(cnt2, cnt1);
1486 
1487     bind(COPY_STR);
1488     if (ae == StrIntrinsicNode::LL) {
1489       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
1490       movb(Address(rsp, cnt2, scale1, stk_offset), result);
1491     } else {
1492       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
1493       movw(Address(rsp, cnt2, scale1, stk_offset), result);
1494     }
1495     decrement(cnt2);
1496     jccb(Assembler::notZero, COPY_STR);
1497 
1498     if (int_cnt2 < 0) { // not constant
1499       pop(cnt2);
1500     }
1501     movptr(str1, rsp);  // New string address
1502 
1503     bind(BIG_STRINGS);
1504     // Load substring.
1505     if (int_cnt2 < 0) { // -1
1506       if (ae == StrIntrinsicNode::UL) {
1507         pmovzxbw(vec, Address(str2, 0));
1508       } else {
1509         movdqu(vec, Address(str2, 0));
1510       }
1511       push(cnt2);       // substr count
1512       push(str2);       // substr addr
1513       push(str1);       // string addr
1514     } else {
1515       // Small (< 8 chars) constant substrings are loaded already.
1516       movl(cnt2, int_cnt2);
1517     }
1518     push(tmp);  // original SP
1519 
1520   } // Finished loading
1521 
1522   //========================================================
1523   // Start search
1524   //
1525 
1526   movptr(result, str1); // string addr
1527 
1528   if (int_cnt2  < 0) {  // Only for non constant substring
1529     jmpb(SCAN_TO_SUBSTR);
1530 
1531     // SP saved at sp+0
1532     // String saved at sp+1*wordSize
1533     // Substr saved at sp+2*wordSize
1534     // Substr count saved at sp+3*wordSize
1535 
1536     // Reload substr for rescan; this code is executed
1537     // only for large substrings (> 8 chars).
1538     bind(RELOAD_SUBSTR);
1539     movptr(str2, Address(rsp, 2*wordSize));
1540     movl(cnt2, Address(rsp, 3*wordSize));
1541     if (ae == StrIntrinsicNode::UL) {
1542       pmovzxbw(vec, Address(str2, 0));
1543     } else {
1544       movdqu(vec, Address(str2, 0));
1545     }
1546     // We came here after the beginning of the substring was
1547     // matched but the rest of it was not, so we need to search
1548     // again. Start from the next element after the previous match.
1549     subptr(str1, result); // Restore counter
1550     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1551       shrl(str1, 1);
1552     }
1553     addl(cnt1, str1);
1554     decrementl(cnt1);   // Shift to next element
1555     cmpl(cnt1, cnt2);
1556     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1557 
1558     addptr(result, (1<<scale1));
1559   } // non constant
1560 
1561   // Scan string for start of substr in 16-byte vectors
1562   bind(SCAN_TO_SUBSTR);
1563   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1564   pcmpestri(vec, Address(result, 0), mode);
1565   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1566   subl(cnt1, stride);
1567   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1568   cmpl(cnt1, cnt2);
1569   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1570   addptr(result, 16);
1571 
1572   bind(ADJUST_STR);
1573   cmpl(cnt1, stride); // Do not read beyond string
1574   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1575   // Back-up string to avoid reading beyond string.
1576   lea(result, Address(result, cnt1, scale1, -16));
1577   movl(cnt1, stride);
1578   jmpb(SCAN_TO_SUBSTR);
1579 
1580   // Found a potential substr
1581   bind(FOUND_CANDIDATE);
1582   // After pcmpestri tmp(rcx) contains matched element index
1583 
1584   // Make sure string is still long enough
1585   subl(cnt1, tmp);
1586   cmpl(cnt1, cnt2);
1587   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
1588   // Left less than substring.
1589 
1590   bind(RET_NOT_FOUND);
1591   movl(result, -1);
1592   jmp(CLEANUP);
1593 
1594   bind(FOUND_SUBSTR);
1595   // Compute start addr of substr
1596   lea(result, Address(result, tmp, scale1));
1597   if (int_cnt2 > 0) { // Constant substring
1598     // Repeat search for small substring (< 8 chars)
1599     // from new point without reloading substring.
1600     // Have to check that we don't read beyond string.
1601     cmpl(tmp, stride-int_cnt2);
1602     jccb(Assembler::greater, ADJUST_STR);
1603     // Fall through if matched whole substring.
1604   } else { // non constant
1605     assert(int_cnt2 == -1, "should be != 0");
1606 
1607     addl(tmp, cnt2);
1608     // Found result if we matched whole substring.
1609     cmpl(tmp, stride);
1610     jcc(Assembler::lessEqual, RET_FOUND);
1611 
1612     // Repeat search for small substring (<= 8 chars)
1613     // from new point 'str1' without reloading substring.
1614     cmpl(cnt2, stride);
1615     // Have to check that we don't read beyond string.
1616     jccb(Assembler::lessEqual, ADJUST_STR);
1617 
1618     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
1619     // Compare the rest of substring (> 8 chars).
1620     movptr(str1, result);
1621 
1622     cmpl(tmp, cnt2);
1623     // First 8 chars are already matched.
1624     jccb(Assembler::equal, CHECK_NEXT);
1625 
1626     bind(SCAN_SUBSTR);
1627     pcmpestri(vec, Address(str1, 0), mode);
1628     // Need to reload string pointers if we did not match the whole vector
1629     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1630 
1631     bind(CHECK_NEXT);
1632     subl(cnt2, stride);
1633     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
1634     addptr(str1, 16);
1635     if (ae == StrIntrinsicNode::UL) {
1636       addptr(str2, 8);
1637     } else {
1638       addptr(str2, 16);
1639     }
1640     subl(cnt1, stride);
1641     cmpl(cnt2, stride); // Do not read beyond substring
1642     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
1643     // Back-up strings to avoid reading beyond substring.
1644 
1645     if (ae == StrIntrinsicNode::UL) {
1646       lea(str2, Address(str2, cnt2, scale2, -8));
1647       lea(str1, Address(str1, cnt2, scale1, -16));
1648     } else {
1649       lea(str2, Address(str2, cnt2, scale2, -16));
1650       lea(str1, Address(str1, cnt2, scale1, -16));
1651     }
1652     subl(cnt1, cnt2);
1653     movl(cnt2, stride);
1654     addl(cnt1, stride);
1655     bind(CONT_SCAN_SUBSTR);
1656     if (ae == StrIntrinsicNode::UL) {
1657       pmovzxbw(vec, Address(str2, 0));
1658     } else {
1659       movdqu(vec, Address(str2, 0));
1660     }
1661     jmp(SCAN_SUBSTR);
1662 
1663     bind(RET_FOUND_LONG);
1664     movptr(str1, Address(rsp, wordSize));
1665   } // non constant
1666 
1667   bind(RET_FOUND);
1668   // Compute substr offset
1669   subptr(result, str1);
1670   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1671     shrl(result, 1); // index
1672   }
1673   bind(CLEANUP);
1674   pop(rsp); // restore SP
1675 
1676 } // string_indexof
1677 
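     // IndexOf for a single UTF-16 char. A rough scalar sketch of the semantics,
     // assuming cnt1 is the char count and 'result' returns a char index:
     //   for (int i = 0; i < cnt1; i++) {
     //     if (str1[i] == ch) return i;
     //   }
     //   return -1;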
1678 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
1679                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
1680   ShortBranchVerifier sbv(this);
1681   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1682 
1683   int stride = 8;
1684 
1685   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
1686         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
1687         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
1688         FOUND_SEQ_CHAR, DONE_LABEL;
1689 
1690   movptr(result, str1);
1691   if (UseAVX >= 2) {
1692     cmpl(cnt1, stride);
1693     jcc(Assembler::less, SCAN_TO_CHAR);
1694     cmpl(cnt1, 2*stride);
1695     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
1696     movdl(vec1, ch);
1697     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
1698     vpxor(vec2, vec2);
1699     movl(tmp, cnt1);
1700     andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
1701     andl(cnt1, 0x0000000F); // tail count (in chars)
1702 
1703     bind(SCAN_TO_16_CHAR_LOOP);
1704     vmovdqu(vec3, Address(result, 0));
1705     vpcmpeqw(vec3, vec3, vec1, 1);
1706     vptest(vec2, vec3);
1707     jcc(Assembler::carryClear, FOUND_CHAR);
1708     addptr(result, 32);
1709     subl(tmp, 2*stride);
1710     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
1711     jmp(SCAN_TO_8_CHAR);
1712     bind(SCAN_TO_8_CHAR_INIT);
1713     movdl(vec1, ch);
1714     pshuflw(vec1, vec1, 0x00);
1715     pshufd(vec1, vec1, 0);
1716     pxor(vec2, vec2);
1717   }
1718   bind(SCAN_TO_8_CHAR);
1719   cmpl(cnt1, stride);
1720   jcc(Assembler::less, SCAN_TO_CHAR);
1721   if (UseAVX < 2) {
1722     movdl(vec1, ch);
1723     pshuflw(vec1, vec1, 0x00);
1724     pshufd(vec1, vec1, 0);
1725     pxor(vec2, vec2);
1726   }
1727   movl(tmp, cnt1);
1728   andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
1729   andl(cnt1, 0x00000007); // tail count (in chars)
1730 
1731   bind(SCAN_TO_8_CHAR_LOOP);
1732   movdqu(vec3, Address(result, 0));
1733   pcmpeqw(vec3, vec1);
1734   ptest(vec2, vec3);
1735   jcc(Assembler::carryClear, FOUND_CHAR);
1736   addptr(result, 16);
1737   subl(tmp, stride);
1738   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
1739   bind(SCAN_TO_CHAR);
1740   testl(cnt1, cnt1);
1741   jcc(Assembler::zero, RET_NOT_FOUND);
1742   bind(SCAN_TO_CHAR_LOOP);
1743   load_unsigned_short(tmp, Address(result, 0));
1744   cmpl(ch, tmp);
1745   jccb(Assembler::equal, FOUND_SEQ_CHAR);
1746   addptr(result, 2);
1747   subl(cnt1, 1);
1748   jccb(Assembler::zero, RET_NOT_FOUND);
1749   jmp(SCAN_TO_CHAR_LOOP);
1750 
1751   bind(RET_NOT_FOUND);
1752   movl(result, -1);
1753   jmpb(DONE_LABEL);
1754 
1755   bind(FOUND_CHAR);
1756   if (UseAVX >= 2) {
1757     vpmovmskb(tmp, vec3);
1758   } else {
1759     pmovmskb(tmp, vec3);
1760   }
1761   bsfl(ch, tmp);
1762   addl(result, ch);
1763 
1764   bind(FOUND_SEQ_CHAR);
1765   subptr(result, str1);
1766   shrl(result, 1);
1767 
1768   bind(DONE_LABEL);
1769 } // string_indexof_char
1770 
1771 // helper function for string_compare
1772 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
1773                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
1774                                            Address::ScaleFactor scale2, Register index, int ae) {
1775   if (ae == StrIntrinsicNode::LL) {
1776     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
1777     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
1778   } else if (ae == StrIntrinsicNode::UU) {
1779     load_unsigned_short(elem1, Address(str1, index, scale, 0));
1780     load_unsigned_short(elem2, Address(str2, index, scale, 0));
1781   } else {
1782     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
1783     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
1784   }
1785 }
1786 
1787 // Compare strings, used for char[] and byte[].
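     // A rough scalar sketch of what is computed, assuming len1/len2 stand for the
     // two element counts and the result follows the String.compareTo convention:
     //   int min = (len1 < len2) ? len1 : len2;
     //   for (int i = 0; i < min; i++) {
     //     if (str1[i] != str2[i]) return str1[i] - str2[i];
     //   }
     //   return len1 - len2;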
1788 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1789                                        Register cnt1, Register cnt2, Register result,
1790                                        XMMRegister vec1, int ae) {
1791   ShortBranchVerifier sbv(this);
1792   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
1793   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
1794   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
1795   int stride2x2 = 0x40;
1796   Address::ScaleFactor scale = Address::no_scale;
1797   Address::ScaleFactor scale1 = Address::no_scale;
1798   Address::ScaleFactor scale2 = Address::no_scale;
1799 
1800   if (ae != StrIntrinsicNode::LL) {
1801     stride2x2 = 0x20;
1802   }
1803 
1804   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
1805     shrl(cnt2, 1);
1806   }
1807   // Compute the minimum of the string lengths, and push the
1808   // difference of the string lengths onto the stack;
1809   // the minimum is selected with a conditional move.
1810   movl(result, cnt1);
1811   subl(cnt1, cnt2);
1812   push(cnt1);
1813   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
1814 
1815   // Is the minimum length zero?
1816   testl(cnt2, cnt2);
1817   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1818   if (ae == StrIntrinsicNode::LL) {
1819     // Load first bytes
1820     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
1821     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
1822   } else if (ae == StrIntrinsicNode::UU) {
1823     // Load first characters
1824     load_unsigned_short(result, Address(str1, 0));
1825     load_unsigned_short(cnt1, Address(str2, 0));
1826   } else {
1827     load_unsigned_byte(result, Address(str1, 0));
1828     load_unsigned_short(cnt1, Address(str2, 0));
1829   }
1830   subl(result, cnt1);
1831   jcc(Assembler::notZero,  POP_LABEL);
1832 
1833   if (ae == StrIntrinsicNode::UU) {
1834     // Divide length by 2 to get number of chars
1835     shrl(cnt2, 1);
1836   }
1837   cmpl(cnt2, 1);
1838   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1839 
1840   // Check if the strings start at the same location and setup scale and stride
1841   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1842     cmpptr(str1, str2);
1843     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1844     if (ae == StrIntrinsicNode::LL) {
1845       scale = Address::times_1;
1846       stride = 16;
1847     } else {
1848       scale = Address::times_2;
1849       stride = 8;
1850     }
1851   } else {
1852     scale1 = Address::times_1;
1853     scale2 = Address::times_2;
1854     // scale not used
1855     stride = 8;
1856   }
1857 
1858   if (UseAVX >= 2 && UseSSE42Intrinsics) {
1859     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
1860     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
1861     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
1862     Label COMPARE_TAIL_LONG;
1863     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
1864 
1865     int pcmpmask = 0x19;
1866     if (ae == StrIntrinsicNode::LL) {
1867       pcmpmask &= ~0x01;
1868     }
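         // Note on the imm8 above: 0x19 selects an "equal each" comparison with a
         // negated result on unsigned shorts, matching the pcmpestri description in
         // the SSE4.2 path below; clearing bit 0 switches to unsigned bytes for LL.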
1869 
1870     // Set up to compare 16-char (32-byte) vectors,
1871     // starting from the first character again because it has an aligned address.
1872     if (ae == StrIntrinsicNode::LL) {
1873       stride2 = 32;
1874     } else {
1875       stride2 = 16;
1876     }
1877     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1878       adr_stride = stride << scale;
1879     } else {
1880       adr_stride1 = 8;  //stride << scale1;
1881       adr_stride2 = 16; //stride << scale2;
1882     }
1883 
1884     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
1885     // rax and rdx are used by pcmpestri as element counters
1886     movl(result, cnt2);
1887     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
1888     jcc(Assembler::zero, COMPARE_TAIL_LONG);
1889 
1890     // Fast path: compare the first two 8-char vectors.
1891     bind(COMPARE_16_CHARS);
1892     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1893       movdqu(vec1, Address(str1, 0));
1894     } else {
1895       pmovzxbw(vec1, Address(str1, 0));
1896     }
1897     pcmpestri(vec1, Address(str2, 0), pcmpmask);
1898     jccb(Assembler::below, COMPARE_INDEX_CHAR);
1899 
1900     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1901       movdqu(vec1, Address(str1, adr_stride));
1902       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
1903     } else {
1904       pmovzxbw(vec1, Address(str1, adr_stride1));
1905       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
1906     }
1907     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
1908     addl(cnt1, stride);
1909 
1910     // Compare the characters at index in cnt1
1911     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
1912     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
1913     subl(result, cnt2);
1914     jmp(POP_LABEL);
1915 
1916     // Setup the registers to start vector comparison loop
1917     bind(COMPARE_WIDE_VECTORS);
1918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1919       lea(str1, Address(str1, result, scale));
1920       lea(str2, Address(str2, result, scale));
1921     } else {
1922       lea(str1, Address(str1, result, scale1));
1923       lea(str2, Address(str2, result, scale2));
1924     }
1925     subl(result, stride2);
1926     subl(cnt2, stride2);
1927     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
1928     negptr(result);
1929 
1930     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
1931     bind(COMPARE_WIDE_VECTORS_LOOP);
1932 
1933 #ifdef _LP64
1934     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
1935       cmpl(cnt2, stride2x2);
1936       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
1937       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
1938       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
1939 
1940       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
1941       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1942         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
1943         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
1944       } else {
1945         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
1946         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
1947       }
1948       kortestql(k7, k7);
1949       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
1950       addptr(result, stride2x2);  // update since we already compared at this addr
1951       subl(cnt2, stride2x2);      // and sub the size too
1952       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
1953 
1954       vpxor(vec1, vec1);
1955       jmpb(COMPARE_WIDE_TAIL);
1956     }//if (VM_Version::supports_avx512vlbw())
1957 #endif // _LP64
1958 
1959 
1960     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1961     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1962       vmovdqu(vec1, Address(str1, result, scale));
1963       vpxor(vec1, Address(str2, result, scale));
1964     } else {
1965       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
1966       vpxor(vec1, Address(str2, result, scale2));
1967     }
1968     vptest(vec1, vec1);
1969     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
1970     addptr(result, stride2);
1971     subl(cnt2, stride2);
1972     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
1973     // clean upper bits of YMM registers
1974     vpxor(vec1, vec1);
1975 
1976     // compare wide vectors tail
1977     bind(COMPARE_WIDE_TAIL);
1978     testptr(result, result);
1979     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1980 
1981     movl(result, stride2);
1982     movl(cnt2, result);
1983     negptr(result);
1984     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1985 
1986     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
1987     bind(VECTOR_NOT_EQUAL);
1988     // clean upper bits of YMM registers
1989     vpxor(vec1, vec1);
1990     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1991       lea(str1, Address(str1, result, scale));
1992       lea(str2, Address(str2, result, scale));
1993     } else {
1994       lea(str1, Address(str1, result, scale1));
1995       lea(str2, Address(str2, result, scale2));
1996     }
1997     jmp(COMPARE_16_CHARS);
1998 
1999     // Compare tail chars, length between 1 and 15 chars
2000     bind(COMPARE_TAIL_LONG);
2001     movl(cnt2, result);
2002     cmpl(cnt2, stride);
2003     jcc(Assembler::less, COMPARE_SMALL_STR);
2004 
2005     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2006       movdqu(vec1, Address(str1, 0));
2007     } else {
2008       pmovzxbw(vec1, Address(str1, 0));
2009     }
2010     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2011     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2012     subptr(cnt2, stride);
2013     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2014     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2015       lea(str1, Address(str1, result, scale));
2016       lea(str2, Address(str2, result, scale));
2017     } else {
2018       lea(str1, Address(str1, result, scale1));
2019       lea(str2, Address(str2, result, scale2));
2020     }
2021     negptr(cnt2);
2022     jmpb(WHILE_HEAD_LABEL);
2023 
2024     bind(COMPARE_SMALL_STR);
2025   } else if (UseSSE42Intrinsics) {
2026     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2027     int pcmpmask = 0x19;
2028     // Set up to compare 8-char (16-byte) vectors,
2029     // starting from the first character again because it has an aligned address.
2030     movl(result, cnt2);
2031     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2032     if (ae == StrIntrinsicNode::LL) {
2033       pcmpmask &= ~0x01;
2034     }
2035     jcc(Assembler::zero, COMPARE_TAIL);
2036     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2037       lea(str1, Address(str1, result, scale));
2038       lea(str2, Address(str2, result, scale));
2039     } else {
2040       lea(str1, Address(str1, result, scale1));
2041       lea(str2, Address(str2, result, scale2));
2042     }
2043     negptr(result);
2044 
2045     // pcmpestri
2046     //   inputs:
2047     //     vec1- substring
2048     //     rax - negative string length (elements count)
2049     //     mem - scanned string
2050     //     rdx - string length (elements count)
2051     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2052     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2053     //   outputs:
2054     //     rcx - first mismatched element index
2055     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2056 
2057     bind(COMPARE_WIDE_VECTORS);
2058     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2059       movdqu(vec1, Address(str1, result, scale));
2060       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2061     } else {
2062       pmovzxbw(vec1, Address(str1, result, scale1));
2063       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2064     }
2065     // After pcmpestri cnt1(rcx) contains mismatched element index
2066 
2067     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2068     addptr(result, stride);
2069     subptr(cnt2, stride);
2070     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2071 
2072     // compare wide vectors tail
2073     testptr(result, result);
2074     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2075 
2076     movl(cnt2, stride);
2077     movl(result, stride);
2078     negptr(result);
2079     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2080       movdqu(vec1, Address(str1, result, scale));
2081       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2082     } else {
2083       pmovzxbw(vec1, Address(str1, result, scale1));
2084       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2085     }
2086     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2087 
2088     // Mismatched characters in the vectors
2089     bind(VECTOR_NOT_EQUAL);
2090     addptr(cnt1, result);
2091     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2092     subl(result, cnt2);
2093     jmpb(POP_LABEL);
2094 
2095     bind(COMPARE_TAIL); // the vector count (cnt2) is zero
2096     movl(cnt2, result);
2097     // Fallthru to tail compare
2098   }
2099   // Shift str2 and str1 to the end of the arrays, negate min
2100   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2101     lea(str1, Address(str1, cnt2, scale));
2102     lea(str2, Address(str2, cnt2, scale));
2103   } else {
2104     lea(str1, Address(str1, cnt2, scale1));
2105     lea(str2, Address(str2, cnt2, scale2));
2106   }
2107   decrementl(cnt2);  // first character was compared already
2108   negptr(cnt2);
2109 
2110   // Compare the rest of the elements
2111   bind(WHILE_HEAD_LABEL);
2112   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2113   subl(result, cnt1);
2114   jccb(Assembler::notZero, POP_LABEL);
2115   increment(cnt2);
2116   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2117 
2118   // Strings are equal up to min length.  Return the length difference.
2119   bind(LENGTH_DIFF_LABEL);
2120   pop(result);
2121   if (ae == StrIntrinsicNode::UU) {
2122     // Divide diff by 2 to get number of chars
2123     sarl(result, 1);
2124   }
2125   jmpb(DONE_LABEL);
2126 
2127 #ifdef _LP64
2128   if (VM_Version::supports_avx512vlbw()) {
2129 
2130     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2131 
2132     kmovql(cnt1, k7);
2133     notq(cnt1);
2134     bsfq(cnt2, cnt1);
2135     if (ae != StrIntrinsicNode::LL) {
2136       // Divide diff by 2 to get number of chars
2137       sarl(cnt2, 1);
2138     }
2139     addq(result, cnt2);
2140     if (ae == StrIntrinsicNode::LL) {
2141       load_unsigned_byte(cnt1, Address(str2, result));
2142       load_unsigned_byte(result, Address(str1, result));
2143     } else if (ae == StrIntrinsicNode::UU) {
2144       load_unsigned_short(cnt1, Address(str2, result, scale));
2145       load_unsigned_short(result, Address(str1, result, scale));
2146     } else {
2147       load_unsigned_short(cnt1, Address(str2, result, scale2));
2148       load_unsigned_byte(result, Address(str1, result, scale1));
2149     }
2150     subl(result, cnt1);
2151     jmpb(POP_LABEL);
2152   }//if (VM_Version::supports_avx512vlbw())
2153 #endif // _LP64
2154 
2155   // Discard the stored length difference
2156   bind(POP_LABEL);
2157   pop(cnt1);
2158 
2159   // That's it
2160   bind(DONE_LABEL);
2161   if(ae == StrIntrinsicNode::UL) {
2162     negl(result);
2163   }
2164 
2165 }
2166 
2167 // Search for Non-ASCII character (Negative byte value) in a byte array,
2168 // return true if it has any and false otherwise.
2169 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
2170 //   @HotSpotIntrinsicCandidate
2171 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
2172 //     for (int i = off; i < off + len; i++) {
2173 //       if (ba[i] < 0) {
2174 //         return true;
2175 //       }
2176 //     }
2177 //     return false;
2178 //   }
2179 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
2180   Register result, Register tmp1,
2181   XMMRegister vec1, XMMRegister vec2) {
2182   // rsi: byte array
2183   // rcx: len
2184   // rax: result
2185   ShortBranchVerifier sbv(this);
2186   assert_different_registers(ary1, len, result, tmp1);
2187   assert_different_registers(vec1, vec2);
2188   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
2189 
2190   // len == 0
2191   testl(len, len);
2192   jcc(Assembler::zero, FALSE_LABEL);
2193 
2194   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
2195     VM_Version::supports_avx512vlbw() &&
2196     VM_Version::supports_bmi2()) {
2197 
2198     Label test_64_loop, test_tail;
2199     Register tmp3_aliased = len;
2200 
2201     movl(tmp1, len);
2202     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
2203 
2204     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
2205     andl(len, ~(64 - 1));    // vector count (in chars)
2206     jccb(Assembler::zero, test_tail);
2207 
2208     lea(ary1, Address(ary1, len, Address::times_1));
2209     negptr(len);
2210 
2211     bind(test_64_loop);
2212     // Check whether our 64 byte-sized elements contain negatives
2213     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
2214     kortestql(k2, k2);
2215     jcc(Assembler::notZero, TRUE_LABEL);
2216 
2217     addptr(len, 64);
2218     jccb(Assembler::notZero, test_64_loop);
2219 
2220 
2221     bind(test_tail);
2222     // bail out when there is nothing to be done
2223     testl(tmp1, -1);
2224     jcc(Assembler::zero, FALSE_LABEL);
2225 
2226     // ~(~0 << len) applied up to two times (for 32-bit scenario)
2227 #ifdef _LP64
2228     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
2229     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
2230     notq(tmp3_aliased);
2231     kmovql(k3, tmp3_aliased);
2232 #else
2233     Label k_init;
2234     jmp(k_init);
2235 
2236     // We cannot read 64 bits from a general purpose register, so the data
2237     // required to compose 64 ones is moved into the instruction stream:
2238     // we emit a 64-byte-wide series of elements 0..63 which is later used
2239     // as a compare target against the tail count contained in tmp1.
2240     // The result is a k register holding tmp1 consecutive 1s, counting
2241     // from the least significant bit.
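         // For example (hypothetical value): with tmp1 == 5, the broadcast and
         // compare below yield a mask with bits 0..4 set, i.e. five consecutive 1s.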
2242     address tmp = pc();
2243     emit_int64(0x0706050403020100);
2244     emit_int64(0x0F0E0D0C0B0A0908);
2245     emit_int64(0x1716151413121110);
2246     emit_int64(0x1F1E1D1C1B1A1918);
2247     emit_int64(0x2726252423222120);
2248     emit_int64(0x2F2E2D2C2B2A2928);
2249     emit_int64(0x3736353433323130);
2250     emit_int64(0x3F3E3D3C3B3A3938);
2251 
2252     bind(k_init);
2253     lea(len, InternalAddress(tmp));
2254     // create mask to test for negative byte inside a vector
2255     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
2256     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
2257 
2258 #endif
2259     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
2260     ktestq(k2, k3);
2261     jcc(Assembler::notZero, TRUE_LABEL);
2262 
2263     jmp(FALSE_LABEL);
2264   } else {
2265     movl(result, len); // copy
2266 
2267     if (UseAVX >= 2 && UseSSE >= 2) {
2268       // With AVX2, use 32-byte vector compare
2269       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2270 
2271       // Compare 32-byte vectors
2272       andl(result, 0x0000001f);  //   tail count (in bytes)
2273       andl(len, 0xffffffe0);   // vector count (in bytes)
2274       jccb(Assembler::zero, COMPARE_TAIL);
2275 
2276       lea(ary1, Address(ary1, len, Address::times_1));
2277       negptr(len);
2278 
2279       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
2280       movdl(vec2, tmp1);
2281       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
2282 
2283       bind(COMPARE_WIDE_VECTORS);
2284       vmovdqu(vec1, Address(ary1, len, Address::times_1));
2285       vptest(vec1, vec2);
2286       jccb(Assembler::notZero, TRUE_LABEL);
2287       addptr(len, 32);
2288       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2289 
2290       testl(result, result);
2291       jccb(Assembler::zero, FALSE_LABEL);
2292 
2293       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2294       vptest(vec1, vec2);
2295       jccb(Assembler::notZero, TRUE_LABEL);
2296       jmpb(FALSE_LABEL);
2297 
2298       bind(COMPARE_TAIL); // len is zero
2299       movl(len, result);
2300       // Fallthru to tail compare
2301     } else if (UseSSE42Intrinsics) {
2302       // With SSE4.2, use double quad vector compare
2303       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2304 
2305       // Compare 16-byte vectors
2306       andl(result, 0x0000000f);  //   tail count (in bytes)
2307       andl(len, 0xfffffff0);   // vector count (in bytes)
2308       jcc(Assembler::zero, COMPARE_TAIL);
2309 
2310       lea(ary1, Address(ary1, len, Address::times_1));
2311       negptr(len);
2312 
2313       movl(tmp1, 0x80808080);
2314       movdl(vec2, tmp1);
2315       pshufd(vec2, vec2, 0);
2316 
2317       bind(COMPARE_WIDE_VECTORS);
2318       movdqu(vec1, Address(ary1, len, Address::times_1));
2319       ptest(vec1, vec2);
2320       jcc(Assembler::notZero, TRUE_LABEL);
2321       addptr(len, 16);
2322       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2323 
2324       testl(result, result);
2325       jcc(Assembler::zero, FALSE_LABEL);
2326 
2327       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2328       ptest(vec1, vec2);
2329       jccb(Assembler::notZero, TRUE_LABEL);
2330       jmpb(FALSE_LABEL);
2331 
2332       bind(COMPARE_TAIL); // len is zero
2333       movl(len, result);
2334       // Fallthru to tail compare
2335     }
2336   }
2337   // Compare 4-byte vectors
2338   andl(len, 0xfffffffc); // vector count (in bytes)
2339   jccb(Assembler::zero, COMPARE_CHAR);
2340 
2341   lea(ary1, Address(ary1, len, Address::times_1));
2342   negptr(len);
2343 
2344   bind(COMPARE_VECTORS);
2345   movl(tmp1, Address(ary1, len, Address::times_1));
2346   andl(tmp1, 0x80808080);
2347   jccb(Assembler::notZero, TRUE_LABEL);
2348   addptr(len, 4);
2349   jcc(Assembler::notZero, COMPARE_VECTORS);
2350 
2351   // Compare trailing char (final 2 bytes), if any
2352   bind(COMPARE_CHAR);
2353   testl(result, 0x2);   // tail  char
2354   jccb(Assembler::zero, COMPARE_BYTE);
2355   load_unsigned_short(tmp1, Address(ary1, 0));
2356   andl(tmp1, 0x00008080);
2357   jccb(Assembler::notZero, TRUE_LABEL);
2358   subptr(result, 2);
2359   lea(ary1, Address(ary1, 2));
2360 
2361   bind(COMPARE_BYTE);
2362   testl(result, 0x1);   // tail  byte
2363   jccb(Assembler::zero, FALSE_LABEL);
2364   load_unsigned_byte(tmp1, Address(ary1, 0));
2365   andl(tmp1, 0x00000080);
2366   jccb(Assembler::notEqual, TRUE_LABEL);
2367   jmpb(FALSE_LABEL);
2368 
2369   bind(TRUE_LABEL);
2370   movl(result, 1);   // return true
2371   jmpb(DONE);
2372 
2373   bind(FALSE_LABEL);
2374   xorl(result, result); // return false
2375 
2376   // That's it
2377   bind(DONE);
2378   if (UseAVX >= 2 && UseSSE >= 2) {
2379     // clean upper bits of YMM registers
2380     vpxor(vec1, vec1);
2381     vpxor(vec2, vec2);
2382   }
2383 }
2384 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
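     // A rough scalar sketch, assuming 'limit' holds the element count and, for
     // is_array_equ, the reference/null/length checks come first:
     //   if (ary1 == ary2) return true;
     //   if (ary1 == NULL || ary2 == NULL || length(ary1) != length(ary2)) return false;
     //   for (int i = 0; i < limit; i++) {
     //     if (ary1[i] != ary2[i]) return false;
     //   }
     //   return true;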
2385 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
2386                                       Register limit, Register result, Register chr,
2387                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
2388   ShortBranchVerifier sbv(this);
2389   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
2390 
2391   int length_offset  = arrayOopDesc::length_offset_in_bytes();
2392   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
2393 
2394   if (is_array_equ) {
2395     // Check the input args
2396     cmpoop(ary1, ary2);
2397     jcc(Assembler::equal, TRUE_LABEL);
2398 
2399     // Need additional checks for arrays_equals.
2400     testptr(ary1, ary1);
2401     jcc(Assembler::zero, FALSE_LABEL);
2402     testptr(ary2, ary2);
2403     jcc(Assembler::zero, FALSE_LABEL);
2404 
2405     // Check the lengths
2406     movl(limit, Address(ary1, length_offset));
2407     cmpl(limit, Address(ary2, length_offset));
2408     jcc(Assembler::notEqual, FALSE_LABEL);
2409   }
2410 
2411   // count == 0
2412   testl(limit, limit);
2413   jcc(Assembler::zero, TRUE_LABEL);
2414 
2415   if (is_array_equ) {
2416     // Load array address
2417     lea(ary1, Address(ary1, base_offset));
2418     lea(ary2, Address(ary2, base_offset));
2419   }
2420 
2421   if (is_array_equ && is_char) {
2422     // arrays_equals when used for char[].
2423     shll(limit, 1);      // byte count != 0
2424   }
2425   movl(result, limit); // copy
2426 
2427   if (UseAVX >= 2) {
2428     // With AVX2, use 32-byte vector compare
2429     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2430 
2431     // Compare 32-byte vectors
2432     andl(result, 0x0000001f);  //   tail count (in bytes)
2433     andl(limit, 0xffffffe0);   // vector count (in bytes)
2434     jcc(Assembler::zero, COMPARE_TAIL);
2435 
2436     lea(ary1, Address(ary1, limit, Address::times_1));
2437     lea(ary2, Address(ary2, limit, Address::times_1));
2438     negptr(limit);
2439 
2440 #ifdef _LP64
2441     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
2442       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
2443 
2444       cmpl(limit, -64);
2445       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2446 
2447       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2448 
2449       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
2450       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
2451       kortestql(k7, k7);
2452       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2453       addptr(limit, 64);  // update since we already compared at this addr
2454       cmpl(limit, -64);
2455       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2456 
2457       // At this point we may still need to compare -limit+result bytes.
2458       // We could execute the next two instructions and just continue via the non-wide path:
2459       //  cmpl(limit, 0);
2460       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
2461       // But since we stopped at the points ary{1,2}+limit which are
2462       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
2463       // (|limit| <= 32 and result < 32),
2464       // we may just compare the last 64 bytes.
2465       //
2466       addptr(result, -64);   // it is safe, bc we just came from this area
2467       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
2468       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
2469       kortestql(k7, k7);
2470       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2471 
2472       jmp(TRUE_LABEL);
2473 
2474       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2475 
2476     }//if (VM_Version::supports_avx512vlbw())
2477 #endif //_LP64
2478     bind(COMPARE_WIDE_VECTORS);
2479     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
2480     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
2481     vpxor(vec1, vec2);
2482 
2483     vptest(vec1, vec1);
2484     jcc(Assembler::notZero, FALSE_LABEL);
2485     addptr(limit, 32);
2486     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2487 
2488     testl(result, result);
2489     jcc(Assembler::zero, TRUE_LABEL);
2490 
2491     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2492     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
2493     vpxor(vec1, vec2);
2494 
2495     vptest(vec1, vec1);
2496     jccb(Assembler::notZero, FALSE_LABEL);
2497     jmpb(TRUE_LABEL);
2498 
2499     bind(COMPARE_TAIL); // limit is zero
2500     movl(limit, result);
2501     // Fallthru to tail compare
2502   } else if (UseSSE42Intrinsics) {
2503     // With SSE4.2, use double quad vector compare
2504     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2505 
2506     // Compare 16-byte vectors
2507     andl(result, 0x0000000f);  //   tail count (in bytes)
2508     andl(limit, 0xfffffff0);   // vector count (in bytes)
2509     jcc(Assembler::zero, COMPARE_TAIL);
2510 
2511     lea(ary1, Address(ary1, limit, Address::times_1));
2512     lea(ary2, Address(ary2, limit, Address::times_1));
2513     negptr(limit);
2514 
2515     bind(COMPARE_WIDE_VECTORS);
2516     movdqu(vec1, Address(ary1, limit, Address::times_1));
2517     movdqu(vec2, Address(ary2, limit, Address::times_1));
2518     pxor(vec1, vec2);
2519 
2520     ptest(vec1, vec1);
2521     jcc(Assembler::notZero, FALSE_LABEL);
2522     addptr(limit, 16);
2523     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2524 
2525     testl(result, result);
2526     jcc(Assembler::zero, TRUE_LABEL);
2527 
2528     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2529     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
2530     pxor(vec1, vec2);
2531 
2532     ptest(vec1, vec1);
2533     jccb(Assembler::notZero, FALSE_LABEL);
2534     jmpb(TRUE_LABEL);
2535 
2536     bind(COMPARE_TAIL); // limit is zero
2537     movl(limit, result);
2538     // Fallthru to tail compare
2539   }
2540 
2541   // Compare 4-byte vectors
2542   andl(limit, 0xfffffffc); // vector count (in bytes)
2543   jccb(Assembler::zero, COMPARE_CHAR);
2544 
2545   lea(ary1, Address(ary1, limit, Address::times_1));
2546   lea(ary2, Address(ary2, limit, Address::times_1));
2547   negptr(limit);
2548 
2549   bind(COMPARE_VECTORS);
2550   movl(chr, Address(ary1, limit, Address::times_1));
2551   cmpl(chr, Address(ary2, limit, Address::times_1));
2552   jccb(Assembler::notEqual, FALSE_LABEL);
2553   addptr(limit, 4);
2554   jcc(Assembler::notZero, COMPARE_VECTORS);
2555 
2556   // Compare trailing char (final 2 bytes), if any
2557   bind(COMPARE_CHAR);
2558   testl(result, 0x2);   // tail  char
2559   jccb(Assembler::zero, COMPARE_BYTE);
2560   load_unsigned_short(chr, Address(ary1, 0));
2561   load_unsigned_short(limit, Address(ary2, 0));
2562   cmpl(chr, limit);
2563   jccb(Assembler::notEqual, FALSE_LABEL);
2564 
2565   if (is_array_equ && is_char) {
2566     bind(COMPARE_BYTE);
2567   } else {
2568     lea(ary1, Address(ary1, 2));
2569     lea(ary2, Address(ary2, 2));
2570 
2571     bind(COMPARE_BYTE);
2572     testl(result, 0x1);   // tail  byte
2573     jccb(Assembler::zero, TRUE_LABEL);
2574     load_unsigned_byte(chr, Address(ary1, 0));
2575     load_unsigned_byte(limit, Address(ary2, 0));
2576     cmpl(chr, limit);
2577     jccb(Assembler::notEqual, FALSE_LABEL);
2578   }
2579   bind(TRUE_LABEL);
2580   movl(result, 1);   // return true
2581   jmpb(DONE);
2582 
2583   bind(FALSE_LABEL);
2584   xorl(result, result); // return false
2585 
2586   // That's it
2587   bind(DONE);
2588   if (UseAVX >= 2) {
2589     // clean upper bits of YMM registers
2590     vpxor(vec1, vec1);
2591     vpxor(vec2, vec2);
2592   }
2593 }